diff --git "a/2D/log_1105_2d_random.job" "b/2D/log_1105_2d_random.job" new file mode 100644--- /dev/null +++ "b/2D/log_1105_2d_random.job" @@ -0,0 +1,760 @@ +nohup: ignoring input +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) +/root/miniconda3/envs/med/lib/python3.9/site-packages/huggingface_hub/file_download.py:943: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/root/miniconda3/envs/med/lib/python3.9/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:432: UserWarning: ModelCheckpoint(save_last=True, save_top_k=None, monitor=None) is a redundant configuration. You can save the last checkpoint with ModelCheckpoint(save_top_k=None, monitor=None). + rank_zero_warn( +GPU available: True, used: True +TPU available: False, using: 0 TPU cores +IPU available: False, using: 0 IPUs +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt +Running on GPUs 0,1,2,3,4,5,6,7 +LatentDiffusion: Running in eps-prediction mode +DiffusionWrapper has 859.53 M params. +making attention of type 'vanilla' with 512 in_channels +Working with z of shape (1, 4, 32, 32) = 4096 dimensions. +making attention of type 'vanilla' with 512 in_channels +Manual init: model.diffusion_model.input_blocks.0.0.weight +Deleting key model.diffusion_model.input_blocks.0.0.weight from state_dict. +Restored from stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt with 352 missing and 199 unexpected keys +Missing Keys: ['model.diffusion_model.input_blocks.0.0.weight', 'cond_stage_model.model.logit_scale', 'cond_stage_model.model.visual.trunk.cls_token', 'cond_stage_model.model.visual.trunk.pos_embed', 'cond_stage_model.model.visual.trunk.patch_embed.proj.weight', 'cond_stage_model.model.visual.trunk.patch_embed.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.norm.weight', 'cond_stage_model.model.visual.trunk.norm.bias', 'cond_stage_model.model.visual.head.proj.weight', 'cond_stage_model.model.text.transformer.embeddings.word_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.position_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.token_type_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.bias', 'cond_stage_model.model.text.proj.0.weight', 'cond_stage_model.model.text.proj.2.weight'] +Unexpected Keys: ['model_ema.decay', 'model_ema.num_updates', 'cond_stage_model.transformer.text_model.embeddings.position_ids', 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight', 'cond_stage_model.transformer.text_model.embeddings.position_embedding.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias', 'cond_stage_model.transformer.text_model.final_layer_norm.weight', 'cond_stage_model.transformer.text_model.final_layer_norm.bias'] +Keeping EMAs of 688. +Merged modelckpt-cfg: +{'target': 'pytorch_lightning.callbacks.ModelCheckpoint', 'params': {'dirpath': 'logs/train_instructpix2pix_2d_random/checkpoints', 'filename': '{epoch:06}', 'verbose': True, 'save_last': True}} +Caution: Saving checkpoints every n train steps without deleting. This might require some free space. +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt + 0it [00:00, ?it/s] 6342it [00:00, 1278675.00it/s] +5073 + 0it [00:00, ?it/s] 361it [00:00, 3607.84it/s] 370it [00:00, 3600.29it/s] + 0it [00:00, ?it/s] 6342it [00:00, 1232921.25it/s] +5073 + 0it [00:00, ?it/s] 359it [00:00, 3580.40it/s] 370it [00:00, 3574.07it/s] +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) +/root/miniconda3/envs/med/lib/python3.9/site-packages/huggingface_hub/file_download.py:943: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) +/root/miniconda3/envs/med/lib/python3.9/site-packages/huggingface_hub/file_download.py:943: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt +Running on GPUs 0,1,2,3,4,5,6,7 +LatentDiffusion: Running in eps-prediction mode +DiffusionWrapper has 859.53 M params. +making attention of type 'vanilla' with 512 in_channels +Working with z of shape (1, 4, 32, 32) = 4096 dimensions. +making attention of type 'vanilla' with 512 in_channels +Manual init: model.diffusion_model.input_blocks.0.0.weight +Deleting key model.diffusion_model.input_blocks.0.0.weight from state_dict. +Restored from stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt with 352 missing and 199 unexpected keys +Missing Keys: ['model.diffusion_model.input_blocks.0.0.weight', 'cond_stage_model.model.logit_scale', 'cond_stage_model.model.visual.trunk.cls_token', 'cond_stage_model.model.visual.trunk.pos_embed', 'cond_stage_model.model.visual.trunk.patch_embed.proj.weight', 'cond_stage_model.model.visual.trunk.patch_embed.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.norm.weight', 'cond_stage_model.model.visual.trunk.norm.bias', 'cond_stage_model.model.visual.head.proj.weight', 'cond_stage_model.model.text.transformer.embeddings.word_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.position_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.token_type_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.bias', 'cond_stage_model.model.text.proj.0.weight', 'cond_stage_model.model.text.proj.2.weight'] +Unexpected Keys: ['model_ema.decay', 'model_ema.num_updates', 'cond_stage_model.transformer.text_model.embeddings.position_ids', 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight', 'cond_stage_model.transformer.text_model.embeddings.position_embedding.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias', 'cond_stage_model.transformer.text_model.final_layer_norm.weight', 'cond_stage_model.transformer.text_model.final_layer_norm.bias'] +Keeping EMAs of 688. +Merged modelckpt-cfg: +{'target': 'pytorch_lightning.callbacks.ModelCheckpoint', 'params': {'dirpath': 'logs/train_instructpix2pix_2d_random/checkpoints', 'filename': '{epoch:06}', 'verbose': True, 'save_last': True}} +Caution: Saving checkpoints every n train steps without deleting. This might require some free space. +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt + 0it [00:00, ?it/s] 6342it [00:00, 1280891.61it/s] +5073 + 0it [00:00, ?it/s]/root/miniconda3/envs/med/lib/python3.9/site-packages/huggingface_hub/file_download.py:943: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( + 356it [00:00, 3550.70it/s] 370it [00:00, 3546.50it/s] + 0it [00:00, ?it/s] 6342it [00:00, 1202055.04it/s] +5073 + 0it [00:00, ?it/s] 342it [00:00, 3418.71it/s] 370it [00:00, 3425.19it/s] +initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/8 +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt +Running on GPUs 0,1,2,3,4,5,6,7 +LatentDiffusion: Running in eps-prediction mode +DiffusionWrapper has 859.53 M params. +making attention of type 'vanilla' with 512 in_channels +Working with z of shape (1, 4, 32, 32) = 4096 dimensions. +making attention of type 'vanilla' with 512 in_channels +Manual init: model.diffusion_model.input_blocks.0.0.weight +Deleting key model.diffusion_model.input_blocks.0.0.weight from state_dict. +Restored from stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt with 352 missing and 199 unexpected keys +Missing Keys: ['model.diffusion_model.input_blocks.0.0.weight', 'cond_stage_model.model.logit_scale', 'cond_stage_model.model.visual.trunk.cls_token', 'cond_stage_model.model.visual.trunk.pos_embed', 'cond_stage_model.model.visual.trunk.patch_embed.proj.weight', 'cond_stage_model.model.visual.trunk.patch_embed.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.norm.weight', 'cond_stage_model.model.visual.trunk.norm.bias', 'cond_stage_model.model.visual.head.proj.weight', 'cond_stage_model.model.text.transformer.embeddings.word_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.position_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.token_type_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.bias', 'cond_stage_model.model.text.proj.0.weight', 'cond_stage_model.model.text.proj.2.weight'] +Unexpected Keys: ['model_ema.decay', 'model_ema.num_updates', 'cond_stage_model.transformer.text_model.embeddings.position_ids', 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight', 'cond_stage_model.transformer.text_model.embeddings.position_embedding.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias', 'cond_stage_model.transformer.text_model.final_layer_norm.weight', 'cond_stage_model.transformer.text_model.final_layer_norm.bias'] +Keeping EMAs of 688. +Merged modelckpt-cfg: +{'target': 'pytorch_lightning.callbacks.ModelCheckpoint', 'params': {'dirpath': 'logs/train_instructpix2pix_2d_random/checkpoints', 'filename': '{epoch:06}', 'verbose': True, 'save_last': True}} +Caution: Saving checkpoints every n train steps without deleting. This might require some free space. +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt + 0it [00:00, ?it/s] 6342it [00:00, 1286031.52it/s] +5073 + 0it [00:00, ?it/s]/root/miniconda3/envs/med/lib/python3.9/site-packages/huggingface_hub/file_download.py:943: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( + 363it [00:00, 3621.85it/s] 370it [00:00, 3614.98it/s] + 0it [00:00, ?it/s] 6342it [00:00, 1188998.57it/s] +5073 + 0it [00:00, ?it/s] 360it [00:00, 3596.42it/s] 370it [00:00, 3589.77it/s] +initializing ddp: GLOBAL_RANK: 2, MEMBER: 3/8 +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt +Running on GPUs 0,1,2,3,4,5,6,7 +LatentDiffusion: Running in eps-prediction mode +DiffusionWrapper has 859.53 M params. +making attention of type 'vanilla' with 512 in_channels +Working with z of shape (1, 4, 32, 32) = 4096 dimensions. +making attention of type 'vanilla' with 512 in_channels +Manual init: model.diffusion_model.input_blocks.0.0.weight +Deleting key model.diffusion_model.input_blocks.0.0.weight from state_dict. +Restored from stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt with 352 missing and 199 unexpected keys +Missing Keys: ['model.diffusion_model.input_blocks.0.0.weight', 'cond_stage_model.model.logit_scale', 'cond_stage_model.model.visual.trunk.cls_token', 'cond_stage_model.model.visual.trunk.pos_embed', 'cond_stage_model.model.visual.trunk.patch_embed.proj.weight', 'cond_stage_model.model.visual.trunk.patch_embed.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.norm.weight', 'cond_stage_model.model.visual.trunk.norm.bias', 'cond_stage_model.model.visual.head.proj.weight', 'cond_stage_model.model.text.transformer.embeddings.word_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.position_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.token_type_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.bias', 'cond_stage_model.model.text.proj.0.weight', 'cond_stage_model.model.text.proj.2.weight'] +Unexpected Keys: ['model_ema.decay', 'model_ema.num_updates', 'cond_stage_model.transformer.text_model.embeddings.position_ids', 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight', 'cond_stage_model.transformer.text_model.embeddings.position_embedding.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias', 'cond_stage_model.transformer.text_model.final_layer_norm.weight', 'cond_stage_model.transformer.text_model.final_layer_norm.bias'] +Keeping EMAs of 688. +Merged modelckpt-cfg: +{'target': 'pytorch_lightning.callbacks.ModelCheckpoint', 'params': {'dirpath': 'logs/train_instructpix2pix_2d_random/checkpoints', 'filename': '{epoch:06}', 'verbose': True, 'save_last': True}} +Caution: Saving checkpoints every n train steps without deleting. This might require some free space. +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt + 0it [00:00, ?it/s] 6342it [00:00, 1299666.58it/s] +5073 + 0it [00:00, ?it/s] 359it [00:00, 3581.06it/s] 370it [00:00, 3575.76it/s] + 0it [00:00, ?it/s]/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( + 6342it [00:00, 1228593.41it/s] +5073 + 0it [00:00, ?it/s] 357it [00:00, 3565.20it/s] 370it [00:00, 3540.02it/s] +initializing ddp: GLOBAL_RANK: 3, MEMBER: 4/8 +/root/miniconda3/envs/med/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) +/root/miniconda3/envs/med/lib/python3.9/site-packages/huggingface_hub/file_download.py:943: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:441: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/8 +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt +Running on GPUs 0,1,2,3,4,5,6,7 +LatentDiffusion: Running in eps-prediction mode +DiffusionWrapper has 859.53 M params. +making attention of type 'vanilla' with 512 in_channels +Working with z of shape (1, 4, 32, 32) = 4096 dimensions. +making attention of type 'vanilla' with 512 in_channels +Manual init: model.diffusion_model.input_blocks.0.0.weight +Deleting key model.diffusion_model.input_blocks.0.0.weight from state_dict. +Restored from stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt with 352 missing and 199 unexpected keys +Missing Keys: ['model.diffusion_model.input_blocks.0.0.weight', 'cond_stage_model.model.logit_scale', 'cond_stage_model.model.visual.trunk.cls_token', 'cond_stage_model.model.visual.trunk.pos_embed', 'cond_stage_model.model.visual.trunk.patch_embed.proj.weight', 'cond_stage_model.model.visual.trunk.patch_embed.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.norm.weight', 'cond_stage_model.model.visual.trunk.norm.bias', 'cond_stage_model.model.visual.head.proj.weight', 'cond_stage_model.model.text.transformer.embeddings.word_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.position_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.token_type_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.bias', 'cond_stage_model.model.text.proj.0.weight', 'cond_stage_model.model.text.proj.2.weight'] +Unexpected Keys: ['model_ema.decay', 'model_ema.num_updates', 'cond_stage_model.transformer.text_model.embeddings.position_ids', 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight', 'cond_stage_model.transformer.text_model.embeddings.position_embedding.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias', 'cond_stage_model.transformer.text_model.final_layer_norm.weight', 'cond_stage_model.transformer.text_model.final_layer_norm.bias'] +Keeping EMAs of 688. +Merged modelckpt-cfg: +{'target': 'pytorch_lightning.callbacks.ModelCheckpoint', 'params': {'dirpath': 'logs/train_instructpix2pix_2d_random/checkpoints', 'filename': '{epoch:06}', 'verbose': True, 'save_last': True}} +Caution: Saving checkpoints every n train steps without deleting. This might require some free space. +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt + 0it [00:00, ?it/s] 6342it [00:00, 783444.05it/s] +5073 + 0it [00:00, ?it/s] 300it [00:00, 2992.80it/s] 370it [00:00, 3084.48it/s] + 0it [00:00, ?it/s] 6342it [00:00, 1217515.38it/s] +5073 + 0it [00:00, ?it/s] 343it [00:00, 3422.61it/s] 370it [00:00, 3423.77it/s] +initializing ddp: GLOBAL_RANK: 4, MEMBER: 5/8 +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) +/root/miniconda3/envs/med/lib/python3.9/site-packages/huggingface_hub/file_download.py:943: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +/root/miniconda3/envs/med/lib/python3.9/site-packages/transformers/utils/generic.py:309: FutureWarning: `torch.utils._pytree._register_pytree_node` is deprecated. Please use `torch.utils._pytree.register_pytree_node` instead. + _torch_pytree._register_pytree_node( +/root/miniconda3/envs/med/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers + warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning) +/root/miniconda3/envs/med/lib/python3.9/site-packages/huggingface_hub/file_download.py:943: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. + warnings.warn( +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt +Running on GPUs 0,1,2,3,4,5,6,7 +LatentDiffusion: Running in eps-prediction mode +DiffusionWrapper has 859.53 M params. +making attention of type 'vanilla' with 512 in_channels +Working with z of shape (1, 4, 32, 32) = 4096 dimensions. +making attention of type 'vanilla' with 512 in_channels +Manual init: model.diffusion_model.input_blocks.0.0.weight +Deleting key model.diffusion_model.input_blocks.0.0.weight from state_dict. +Restored from stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt with 352 missing and 199 unexpected keys +Missing Keys: ['model.diffusion_model.input_blocks.0.0.weight', 'cond_stage_model.model.logit_scale', 'cond_stage_model.model.visual.trunk.cls_token', 'cond_stage_model.model.visual.trunk.pos_embed', 'cond_stage_model.model.visual.trunk.patch_embed.proj.weight', 'cond_stage_model.model.visual.trunk.patch_embed.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.norm.weight', 'cond_stage_model.model.visual.trunk.norm.bias', 'cond_stage_model.model.visual.head.proj.weight', 'cond_stage_model.model.text.transformer.embeddings.word_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.position_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.token_type_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.bias', 'cond_stage_model.model.text.proj.0.weight', 'cond_stage_model.model.text.proj.2.weight'] +Unexpected Keys: ['model_ema.decay', 'model_ema.num_updates', 'cond_stage_model.transformer.text_model.embeddings.position_ids', 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight', 'cond_stage_model.transformer.text_model.embeddings.position_embedding.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias', 'cond_stage_model.transformer.text_model.final_layer_norm.weight', 'cond_stage_model.transformer.text_model.final_layer_norm.bias'] +Keeping EMAs of 688. +Merged modelckpt-cfg: +{'target': 'pytorch_lightning.callbacks.ModelCheckpoint', 'params': {'dirpath': 'logs/train_instructpix2pix_2d_random/checkpoints', 'filename': '{epoch:06}', 'verbose': True, 'save_last': True}} +Caution: Saving checkpoints every n train steps without deleting. This might require some free space. +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt + 0it [00:00, ?it/s] 6342it [00:00, 1278367.74it/s] +5073 + 0it [00:00, ?it/s] 360it [00:00, 3599.42it/s] 370it [00:00, 3590.66it/s] + 0it [00:00, ?it/s] 6342it [00:00, 1203142.43it/s] +5073 + 0it [00:00, ?it/s] 363it [00:00, 3622.39it/s] 370it [00:00, 3616.71it/s] +initializing ddp: GLOBAL_RANK: 5, MEMBER: 6/8 +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt +Running on GPUs 0,1,2,3,4,5,6,7 +LatentDiffusion: Running in eps-prediction mode +DiffusionWrapper has 859.53 M params. +making attention of type 'vanilla' with 512 in_channels +Working with z of shape (1, 4, 32, 32) = 4096 dimensions. +making attention of type 'vanilla' with 512 in_channels +Manual init: model.diffusion_model.input_blocks.0.0.weight +Deleting key model.diffusion_model.input_blocks.0.0.weight from state_dict. +Restored from stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt with 352 missing and 199 unexpected keys +Missing Keys: ['model.diffusion_model.input_blocks.0.0.weight', 'cond_stage_model.model.logit_scale', 'cond_stage_model.model.visual.trunk.cls_token', 'cond_stage_model.model.visual.trunk.pos_embed', 'cond_stage_model.model.visual.trunk.patch_embed.proj.weight', 'cond_stage_model.model.visual.trunk.patch_embed.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.norm.weight', 'cond_stage_model.model.visual.trunk.norm.bias', 'cond_stage_model.model.visual.head.proj.weight', 'cond_stage_model.model.text.transformer.embeddings.word_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.position_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.token_type_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.bias', 'cond_stage_model.model.text.proj.0.weight', 'cond_stage_model.model.text.proj.2.weight'] +Unexpected Keys: ['model_ema.decay', 'model_ema.num_updates', 'cond_stage_model.transformer.text_model.embeddings.position_ids', 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight', 'cond_stage_model.transformer.text_model.embeddings.position_embedding.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias', 'cond_stage_model.transformer.text_model.final_layer_norm.weight', 'cond_stage_model.transformer.text_model.final_layer_norm.bias'] +Keeping EMAs of 688. +Merged modelckpt-cfg: +{'target': 'pytorch_lightning.callbacks.ModelCheckpoint', 'params': {'dirpath': 'logs/train_instructpix2pix_2d_random/checkpoints', 'filename': '{epoch:06}', 'verbose': True, 'save_last': True}} +Caution: Saving checkpoints every n train steps without deleting. This might require some free space. +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt + 0it [00:00, ?it/s] 6342it [00:00, 1289085.34it/s] +5073 + 0it [00:00, ?it/s] 361it [00:00, 3603.06it/s] 370it [00:00, 3597.11it/s] + 0it [00:00, ?it/s] 6342it [00:00, 1213350.18it/s] +5073 + 0it [00:00, ?it/s] 364it [00:00, 3630.13it/s] 370it [00:00, 3625.17it/s] +initializing ddp: GLOBAL_RANK: 6, MEMBER: 7/8 +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt +Running on GPUs 0,1,2,3,4,5,6,7 +LatentDiffusion: Running in eps-prediction mode +DiffusionWrapper has 859.53 M params. +making attention of type 'vanilla' with 512 in_channels +Working with z of shape (1, 4, 32, 32) = 4096 dimensions. +making attention of type 'vanilla' with 512 in_channels +Manual init: model.diffusion_model.input_blocks.0.0.weight +Deleting key model.diffusion_model.input_blocks.0.0.weight from state_dict. +Restored from stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt with 352 missing and 199 unexpected keys +Missing Keys: ['model.diffusion_model.input_blocks.0.0.weight', 'cond_stage_model.model.logit_scale', 'cond_stage_model.model.visual.trunk.cls_token', 'cond_stage_model.model.visual.trunk.pos_embed', 'cond_stage_model.model.visual.trunk.patch_embed.proj.weight', 'cond_stage_model.model.visual.trunk.patch_embed.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.0.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.0.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.1.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.1.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.2.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.2.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.3.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.3.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.4.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.4.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.5.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.5.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.6.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.6.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.7.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.7.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.8.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.8.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.9.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.9.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.10.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.10.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.qkv.bias', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.weight', 'cond_stage_model.model.visual.trunk.blocks.11.attn.proj.bias', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.norm2.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc1.bias', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.weight', 'cond_stage_model.model.visual.trunk.blocks.11.mlp.fc2.bias', 'cond_stage_model.model.visual.trunk.norm.weight', 'cond_stage_model.model.visual.trunk.norm.bias', 'cond_stage_model.model.visual.head.proj.weight', 'cond_stage_model.model.text.transformer.embeddings.word_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.position_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.token_type_embeddings.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.weight', 'cond_stage_model.model.text.transformer.embeddings.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.0.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.1.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.2.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.3.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.4.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.5.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.6.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.7.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.8.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.9.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.10.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.query.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.key.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.self.value.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.attention.output.LayerNorm.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.intermediate.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.dense.bias', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.weight', 'cond_stage_model.model.text.transformer.encoder.layer.11.output.LayerNorm.bias', 'cond_stage_model.model.text.proj.0.weight', 'cond_stage_model.model.text.proj.2.weight'] +Unexpected Keys: ['model_ema.decay', 'model_ema.num_updates', 'cond_stage_model.transformer.text_model.embeddings.position_ids', 'cond_stage_model.transformer.text_model.embeddings.token_embedding.weight', 'cond_stage_model.transformer.text_model.embeddings.position_embedding.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight', 'cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias', 'cond_stage_model.transformer.text_model.final_layer_norm.weight', 'cond_stage_model.transformer.text_model.final_layer_norm.bias'] +Keeping EMAs of 688. +Merged modelckpt-cfg: +{'target': 'pytorch_lightning.callbacks.ModelCheckpoint', 'params': {'dirpath': 'logs/train_instructpix2pix_2d_random/checkpoints', 'filename': '{epoch:06}', 'verbose': True, 'save_last': True}} +Caution: Saving checkpoints every n train steps without deleting. This might require some free space. +[patch] torch.load monkey-patched for legacy Lightning/NumPy ckpt + 0it [00:00, ?it/s] 6342it [00:00, 1267102.18it/s] +5073 + 0it [00:00, ?it/s] 361it [00:00, 3602.75it/s] 370it [00:00, 3594.28it/s] + 0it [00:00, ?it/s] 6342it [00:00, 1219245.36it/s] +5073 + 0it [00:00, ?it/s] 364it [00:00, 3636.73it/s] 370it [00:00, 3629.77it/s] +initializing ddp: GLOBAL_RANK: 7, MEMBER: 8/8 +---------------------------------------------------------------------------------------------------- +distributed_backend=nccl +All DDP processes registered. Starting ddp with 8 processes +---------------------------------------------------------------------------------------------------- + +/root/miniconda3/envs/med/lib/python3.9/site-packages/pytorch_lightning/core/datamodule.py:423: LightningDeprecationWarning: DataModule.setup has already been called, so it will not be called again. In v1.6 this behavior will change to always call DataModule.setup. + rank_zero_deprecation( +LOCAL_RANK: 7 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 5 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 4 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 6 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] +LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7] + + | Name | Type | Params +--------------------------------------------------------------- +0 | model | DiffusionWrapper | 859 M +1 | first_stage_model | AutoencoderKL | 83.7 M +2 | cond_stage_model | FrozenBioMedCLIPEmbedder | 195 M +3 | model_ema | LitEma | 0 +--------------------------------------------------------------- +859 M Trainable params +279 M Non-trainable params +1.1 B Total params +4,556.356 Total estimated model params size (MB) +#### Data ##### +train, CTReportDataset, 5073 +validation, CTReportDatasetinfer, 40 +accumulate_grad_batches = 4 +++++ NOT USING LR SCALING ++++ +Setting learning rate to 1.00e-04 +Setting up LambdaLR scheduler... +Project config +model: + base_learning_rate: 0.0001 + target: ldm.models.diffusion.ddpm_edit.LatentDiffusion + params: + ckpt_path: stable_diffusion/models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt + linear_start: 0.00085 + linear_end: 0.012 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: edited + cond_stage_key: edit + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: hybrid + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: true + load_ema: false + scheduler_config: + target: ldm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: + - 0 + cycle_lengths: + - 10000000000000 + f_start: + - 1.0e-06 + f_max: + - 1.0 + f_min: + - 1.0 + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + image_size: 32 + in_channels: 8 + out_channels: 4 + model_channels: 320 + attention_resolutions: + - 4 + - 2 + - 1 + num_res_blocks: 2 + channel_mult: + - 1 + - 2 + - 4 + - 4 + num_heads: 8 + use_spatial_transformer: true + transformer_depth: 1 + context_dim: 768 + use_checkpoint: true + legacy: false + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenBioMedCLIPEmbedder +data: + target: main.DataModuleFromConfig + params: + batch_size: 8 + num_workers: 8 + train: + target: ldm.data.ct_clip_data_train.CTReportDataset + params: + data_folder: /workspace/jifu/data/dataset/train_fixed + csv_file: /workspace/jifu/data/data_json/radiology_text_reports/train_reports.csv + validation: + target: ldm.data.ct_clip_data_inference.CTReportDatasetinfer + params: + data_folder: /workspace/jifu/data/dataset/valid_fixed + csv_file: /workspace/jifu/data/data_json/radiology_text_reports/valid_reports.csv + labels: /workspace/jifu/data/data_json/multi_abnormality_labels/valid_predicted_labels.csv + +Lightning config +callbacks: + image_logger: + target: main.ImageLogger + params: + batch_frequency: 200000000 + max_images: 2 + increase_log_steps: false +trainer: + max_epochs: 2000 + benchmark: true + accumulate_grad_batches: 4 + check_val_every_n_epoch: 1000000 + accelerator: ddp + gpus: 0,1,2,3,4,5,6,7 + + Validation sanity check: 0it [00:00, ?it/s] Validation sanity check: 0%| | 0/1 [00:00