xishushu committed on
Commit
9790b8f
·
0 Parent(s):

Add SyncHuman model checkpoints

Browse files
.gitattributes ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
2
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.pth filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.zip filter=lfs diff=lfs merge=lfs -text
ckpts/OneStage/SyncHuman_2D3DCrossSpaceDiffusion/config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "SyncHuman_2D3DCrossSpaceDiffusion",
3
+ "_diffusers_version": "0.29.1",
4
+ "act_fn": "silu",
5
+ "addition_channels": [
6
+ 1280,
7
+ 1280,
8
+ 1280
9
+ ],
10
+ "addition_downsample": false,
11
+ "addition_embed_type": null,
12
+ "addition_embed_type_num_heads": 64,
13
+ "addition_time_embed_dim": null,
14
+ "attention_head_dim": [
15
+ 5,
16
+ 10,
17
+ 20,
18
+ 20
19
+ ],
20
+ "block_out_channels": [
21
+ 320,
22
+ 640,
23
+ 1280,
24
+ 1280
25
+ ],
26
+ "cd_attention_last": false,
27
+ "cd_attention_mid": false,
28
+ "center_input_sample": false,
29
+ "class_embed_type": "projection",
30
+ "class_embeddings_concat": false,
31
+ "cond_channels": 1024,
32
+ "conv_in_kernel": 3,
33
+ "conv_out_kernel": 3,
34
+ "cross_attention_dim": 1024,
35
+ "cross_attention_norm": null,
36
+ "decay": 0.9999,
37
+ "down_block_types": [
38
+ "CrossAttnDownBlockMV2D",
39
+ "CrossAttnDownBlockMV2D",
40
+ "CrossAttnDownBlockMV2D",
41
+ "DownBlock2D"
42
+ ],
43
+ "downsample_padding": 1,
44
+ "dual_cross_attention": false,
45
+ "encoder_hid_dim": null,
46
+ "encoder_hid_dim_type": null,
47
+ "flip_sin_to_cos": true,
48
+ "freq_shift": 0,
49
+ "in_channels": 8,
50
+ "inv_gamma": 1.0,
51
+ "layers_per_block": 2,
52
+ "mid_block_only_cross_attention": null,
53
+ "mid_block_scale_factor": 1,
54
+ "mid_block_type": "UNetMidBlockMV2DCrossAttn",
55
+ "min_decay": 0.0,
56
+ "mlp_ratio": 4,
57
+ "model_channels": 1024,
58
+ "multiview_attention": true,
59
+ "mv_cond_channels": 640,
60
+ "mvcd_attention": true,
61
+ "norm_eps": 1e-05,
62
+ "norm_num_groups": 32,
63
+ "num_attention_heads": null,
64
+ "num_blocks": 24,
65
+ "num_class_embeds": null,
66
+ "num_dual_CrossAttblocks": 2,
67
+ "num_head_channels": 64,
68
+ "num_heads": 16,
69
+ "num_views": 5,
70
+ "only_cross_attention": false,
71
+ "optimization_step": 42500,
72
+ "out_channels": 4,
73
+ "patch_size": 1,
74
+ "pe_mode": "ape",
75
+ "power": 0.6666666666666666,
76
+ "projection_class_embeddings_input_dim": 2048,
77
+ "qk_rms_norm": true,
78
+ "qk_rms_norm_cross": false,
79
+ "resnet_out_scale_factor": 1.0,
80
+ "resnet_skip_time_act": false,
81
+ "resnet_time_scale_shift": "default",
82
+ "resolution": 16,
83
+ "sample_size": 96,
84
+ "selfattn_block": "self_rowwise",
85
+ "share_mod": false,
86
+ "sparse_mv_attention": true,
87
+ "time_cond_proj_dim": null,
88
+ "time_embedding_act_fn": null,
89
+ "time_embedding_dim": null,
90
+ "time_embedding_type": "positional",
91
+ "timestep_post_act": null,
92
+ "transformer_layers_per_block": 1,
93
+ "trellis_in_channels": 8,
94
+ "trellis_out_channels": 8,
95
+ "up_block_types": [
96
+ "UpBlock2D",
97
+ "CrossAttnUpBlockMV2D",
98
+ "CrossAttnUpBlockMV2D",
99
+ "CrossAttnUpBlockMV2D"
100
+ ],
101
+ "upcast_attention": true,
102
+ "update_after_step": 0,
103
+ "use_checkpoint": false,
104
+ "use_dino": false,
105
+ "use_ema_warmup": false,
106
+ "use_fp16": true,
107
+ "use_linear_projection": true
108
+ }
ckpts/OneStage/SyncHuman_2D3DCrossSpaceDiffusion/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf8835494bf8afad7fe670a675431cee0dad0dbd05f63a4df23313da5fe08705
3
+ size 6814820408
ckpts/OneStage/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_processor_type": "CLIPImageProcessor",
17
+ "image_std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ],
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "shortest_edge": 224
26
+ }
27
+ }
ckpts/OneStage/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "stabilityai/stable-diffusion-2-1-unclip",
3
+ "architectures": [
4
+ "CLIPVisionModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "dropout": 0.0,
8
+ "hidden_act": "gelu",
9
+ "hidden_size": 1280,
10
+ "image_size": 224,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5120,
14
+ "layer_norm_eps": 1e-05,
15
+ "model_type": "clip_vision_model",
16
+ "num_attention_heads": 16,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 32,
19
+ "patch_size": 14,
20
+ "projection_dim": 1024,
21
+ "torch_dtype": "float16",
22
+ "transformers_version": "4.36.0"
23
+ }
ckpts/OneStage/image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae616c24393dd1854372b0639e5541666f7521cbe219669255e865cb7f89466a
3
+ size 1264217240
ckpts/OneStage/pipeline_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_views": 5,
3
+ "metadata": {
4
+ "feature_extractor": {
5
+ "class": "transformers.models.clip.image_processing_clip.CLIPImageProcessor",
6
+ "subdir": "feature_extractor"
7
+ },
8
+ "image_encoder": {
9
+ "class": "transformers.models.clip.modeling_clip.CLIPVisionModelWithProjection",
10
+ "subdir": "image_encoder"
11
+ },
12
+ "text_encoder": {
13
+ "class": "transformers.models.clip.modeling_clip.CLIPTextModel",
14
+ "subdir": "text_encoder"
15
+ },
16
+ "SyncHuman_2D3DCrossSpaceDiffusion": {
17
+ "class": "SyncHuman.models.OneStage.SyncHuman_2D3DCrossSpaceDiffusion.SyncHuman_2D3DCrossSpaceDiffusion",
18
+ "subdir": "SyncHuman_2D3DCrossSpaceDiffusion"
19
+ },
20
+ "vae": {
21
+ "class": "diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL",
22
+ "subdir": "vae"
23
+ },
24
+ "sparse_structure_decoder": {
25
+ "class": "SyncHuman.models.OneStage.sparse_structure_vae.SparseStructureDecoder",
26
+ "subdir": "sparse_structure_decoder"
27
+ }
28
+ }
29
+ }
ckpts/OneStage/sparse_structure_decoder/config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {
3
+ "name": "SparseStructureDecoder",
4
+ "args": {
5
+ "out_channels": 1,
6
+ "latent_channels": 8,
7
+ "num_res_blocks": 2,
8
+ "num_res_blocks_middle": 2,
9
+ "channels": [512, 128, 32],
10
+ "use_fp16": true
11
+ }
12
+ }
ckpts/OneStage/sparse_structure_decoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70da369b1120a5a267be33ec133f8197f1647fa91fd5b81cd71b13c7031e619d
3
+ size 147368034
ckpts/OneStage/text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "stabilityai/stable-diffusion-2-1-unclip",
3
+ "architectures": [
4
+ "CLIPTextModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dropout": 0.0,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_size": 1024,
12
+ "initializer_factor": 1.0,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 77,
17
+ "model_type": "clip_text_model",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 23,
20
+ "pad_token_id": 1,
21
+ "projection_dim": 512,
22
+ "torch_dtype": "float16",
23
+ "transformers_version": "4.36.0",
24
+ "vocab_size": 49408
25
+ }
ckpts/OneStage/text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc1827c465450322616f06dea41596eac7d493f4e95904dcb51f0fc745c4e13f
3
+ size 680820392
ckpts/OneStage/vae/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.29.1",
4
+ "_name_or_path": "stabilityai/stable-diffusion-2-1-unclip",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 4,
21
+ "latents_mean": null,
22
+ "latents_std": null,
23
+ "layers_per_block": 2,
24
+ "norm_num_groups": 32,
25
+ "out_channels": 3,
26
+ "sample_size": 768,
27
+ "scaling_factor": 0.18215,
28
+ "shift_factor": null,
29
+ "up_block_types": [
30
+ "UpDecoderBlock2D",
31
+ "UpDecoderBlock2D",
32
+ "UpDecoderBlock2D",
33
+ "UpDecoderBlock2D"
34
+ ],
35
+ "use_post_quant_conv": true,
36
+ "use_quant_conv": true
37
+ }
ckpts/OneStage/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
3
+ size 167335342
ckpts/SecondStage/ckpts/decoder_GS/config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "SLatGaussianDecoder",
3
+ "args": {
4
+ "resolution": 64,
5
+ "model_channels": 768,
6
+ "latent_channels": 8,
7
+ "num_blocks": 12,
8
+ "num_heads": 12,
9
+ "mlp_ratio": 4,
10
+ "attn_mode": "swin",
11
+ "window_size": 8,
12
+ "use_fp16": true,
13
+ "mv_condition_mode": "4_view",
14
+ "use_multiscale": true,
15
+ "use_faceinfo": false,
16
+ "representation_config": {
17
+ "lr": {
18
+ "_xyz": 1.0,
19
+ "_features_dc": 1.0,
20
+ "_opacity": 1.0,
21
+ "_scaling": 1.0,
22
+ "_rotation": 0.1
23
+ },
24
+ "perturb_offset": true,
25
+ "voxel_size": 1.5,
26
+ "num_gaussians": 32,
27
+ "2d_filter_kernel_size": 0.1,
28
+ "3d_filter_kernel_size": 9e-4,
29
+ "scaling_bias": 4e-3,
30
+ "opacity_bias": 0.1,
31
+ "scaling_activation": "softplus"
32
+ }
33
+ }
34
+ }
ckpts/SecondStage/ckpts/decoder_GS/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d6138d8e2f0756a67baba2c753019c224222c8ad5bec8f555a57ae8e0af2546
3
+ size 1809690624
ckpts/SecondStage/ckpts/decoder_Mesh/config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "SLatMeshDecoder",
3
+ "args": {
4
+ "resolution": 64,
5
+ "model_channels": 768,
6
+ "latent_channels": 8,
7
+ "num_blocks": 12,
8
+ "num_heads": 12,
9
+ "mlp_ratio": 4,
10
+ "attn_mode": "swin",
11
+ "window_size": 8,
12
+ "use_fp16": true,
13
+ "mv_condition_mode": "4_view",
14
+ "use_multiscale": true,
15
+ "use_faceinfo": false,
16
+ "representation_config": {
17
+ "use_color": true
18
+ }
19
+ }
20
+ }
ckpts/SecondStage/ckpts/decoder_Mesh/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8f1e2088ad3f4f87d2502744cc66c116d239b7dd1b74e5d850dad2aa56afd09
3
+ size 1831922964
ckpts/SecondStage/ckpts/slat_flow/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "SLatFlowModel",
3
+ "args": {
4
+ "resolution": 64,
5
+ "in_channels": 8,
6
+ "out_channels": 8,
7
+ "model_channels": 1024,
8
+ "cond_channels": 1024,
9
+ "num_blocks": 24,
10
+ "num_heads": 16,
11
+ "mlp_ratio": 4,
12
+ "patch_size": 2,
13
+ "num_io_res_blocks": 2,
14
+ "io_block_channels": [128],
15
+ "pe_mode": "ape",
16
+ "qk_rms_norm": true,
17
+ "use_fp16": true
18
+ }
19
+ }
ckpts/SecondStage/ckpts/slat_flow/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:693fb2a58ad497bd222007301eeec49d14d60f8c12d2f2f00c221fa747b4c66c
3
+ size 1203755136
ckpts/SecondStage/pipeline.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "SyncHumanTwoStagePipeline",
3
+ "args": {
4
+ "models": {
5
+ "slat_decoder_gs": "ckpts/decoder_GS",
6
+ "slat_decoder_mesh": "ckpts/decoder_Mesh",
7
+ "slat_flow_model": "ckpts/slat_flow"
8
+ },
9
+ "slat_sampler": {
10
+ "name": "FlowEulerGuidanceIntervalSampler",
11
+ "args": {
12
+ "sigma_min": 1e-5
13
+ },
14
+ "params": {
15
+ "steps": 25,
16
+ "cfg_strength": 5.0,
17
+ "cfg_interval": [0.5, 1.0],
18
+ "rescale_t": 3.0
19
+ }
20
+ },
21
+ "slat_normalization": {
22
+ "mean": [
23
+ -2.1687545776367188,
24
+ -0.004347046371549368,
25
+ -0.13352349400520325,
26
+ -0.08418072760105133,
27
+ -0.5271206498146057,
28
+ 0.7238689064979553,
29
+ -1.1414450407028198,
30
+ 1.2039363384246826
31
+ ],
32
+ "std": [
33
+ 2.377650737762451,
34
+ 2.386378288269043,
35
+ 2.124418020248413,
36
+ 2.1748552322387695,
37
+ 2.663944721221924,
38
+ 2.371192216873169,
39
+ 2.6217446327209473,
40
+ 2.684523105621338
41
+ ]
42
+ },
43
+ "image_cond_model": "dinov2_vitl14_reg"
44
+ }
45
+ }