| { | |
| "backbone_config": { | |
| "model_type": "sapiens2_backbone", | |
| "arch": "sapiens2_0.4b", | |
| "embed_dims": 1024, | |
| "num_layers": 24, | |
| "num_heads": 16, | |
| "feedforward_channels": 4096, | |
| "image_size": [ | |
| 1024, | |
| 768 | |
| ], | |
| "patch_size": 16, | |
| "n_storage_tokens": 8, | |
| "rope_base": 100.0, | |
| "rope_normalize_coords": "separate", | |
| "mhsa_early": 8, | |
| "mhsa_late": 8, | |
| "layer_scale_init_value": 0.0001, | |
| "final_norm": true | |
| }, | |
| "head_config": { | |
| "model_type": "sapiens2_head", | |
| "task": "seg", | |
| "in_channels": 1024, | |
| "deconv_out_channels": [ | |
| 512, | |
| 256, | |
| 128, | |
| 64 | |
| ], | |
| "deconv_kernel_sizes": [ | |
| 4, | |
| 4, | |
| 4, | |
| 4 | |
| ], | |
| "conv_out_channels": [ | |
| 64, | |
| 64 | |
| ], | |
| "conv_kernel_sizes": [ | |
| 1, | |
| 1 | |
| ], | |
| "num_keypoints": 308, | |
| "num_classes": 29, | |
| "upsample_channels": null, | |
| "scale_conv_out_channels": null, | |
| "scale_conv_kernel_sizes": null, | |
| "scale_final_layer": null | |
| }, | |
| "model_type": "sapiens2", | |
| "quantization": { | |
| "group_size": 64, | |
| "bits": 4, | |
| "mode": "affine" | |
| }, | |
| "quantization_config": { | |
| "group_size": 64, | |
| "bits": 4, | |
| "mode": "affine" | |
| }, | |
| "size": "0.4b", | |
| "task": "seg", | |
| "vision_config": {} | |
| } |