# Model configuration: a backbone built from three branches that interact
# with each other. Each branch declares its own transformer hyper-parameters
# (depth / embed_dim / num_heads), its own input resolution and its own
# pretrained checkpoint.
#
# The branches are ordered from the largest model at the smallest input
# resolution (branch1: depth 24, dim 1024, img_size 96) to the smallest model
# at the largest input resolution (branch3: depth 12, dim 384, img_size 320).
model = dict(
    backbone=dict(
        # --- settings shared by the cross-branch interaction modules ---
        # NOTE(review): the exact meaning of these keys is defined by the
        # backbone implementation, which is not visible from this file.
        n_points=4,
        deform_num_heads=16,
        cffn_ratio=0.25,
        deform_ratio=0.5,
        with_cffn=True,
        interact_attn_type='deform',
        interaction_drop_path_rate=0.4,
        separate_head=True,

        # Branch 1: 24 layers, 1024-dim, 16 heads, 96px input.
        branch1=dict(
            model_type="augreg",
            img_size=96,
            patch_size=16,
            # Resolution / patch size the checkpoint below was trained with
            # (the file name ends in "res_224").
            pretrain_img_size=224,
            pretrain_patch_size=16,
            depth=24,
            embed_dim=1024,
            num_heads=16,
            mlp_ratio=4,
            qkv_bias=True,
            drop_path_rate=0.4,
            # 12 entries, each covering two consecutive layers, so the 24
            # layers here line up with the 12 entries of the other branches.
            # Presumably [first_layer, last_layer] per interaction stage --
            # TODO confirm against the backbone code.
            interaction_indexes=[[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13], [14, 15], [16, 17], [18, 19], [20, 21], [22, 23]],
            use_cls_token=True,
            use_flash_attn=True,
            with_cp=True,
            pretrained="pretrained/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.pth",
        ),

        # Branch 2: 12 layers, 768-dim, 12 heads, 160px input.
        branch2=dict(
            model_type="augreg",
            img_size=160,
            patch_size=16,
            pretrain_img_size=224,
            pretrain_patch_size=16,
            depth=12,
            embed_dim=768,
            num_heads=12,
            mlp_ratio=4,
            qkv_bias=True,
            drop_path_rate=0.2,
            # 12 entries, one layer each (start == end).
            interaction_indexes=[[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]],
            use_cls_token=True,
            use_flash_attn=True,
            with_cp=True,
            pretrained="pretrained/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.pth",
        ),

        # Branch 3: 12 layers, 384-dim, 6 heads, 320px input.
        branch3=dict(
            model_type="augreg",
            img_size=320,
            patch_size=16,
            pretrain_img_size=224,
            pretrain_patch_size=16,
            depth=12,
            embed_dim=384,
            num_heads=6,
            mlp_ratio=4,
            qkv_bias=True,
            drop_path_rate=0.05,
            # 12 entries, one layer each (start == end).
            interaction_indexes=[[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11]],
            use_cls_token=True,
            use_flash_attn=True,
            with_cp=True,
            pretrained="pretrained/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.pth",
        ),
    ),
)