# Source: VideoMaMa / sam2_hiera_l.yaml
# Last commit: 0b4dee0 "sam2 checkpoint update" (pizb)
# Model Configuration for SAM2
# This file should be placed alongside the SAM2 checkpoint
# SAM 2 Hiera Large Configuration
# Top-level model definition.
#
# Each `_target_` value is a dotted Python path to a class; the sibling keys
# are presumably that class's constructor arguments, resolved by a Hydra-style
# `instantiate` call (confirm against the code that loads this file).
#
# NOTE(review): the nesting below was rebuilt from a copy in which every key
# sat at column 0 (which made `_target_`, `d_model`, `num_layers`, ... clash
# as duplicate keys). Verify the parent of each key against the constructor
# signature of the targeted class before relying on it.
model:
  _target_: sam2.modeling.sam2_base.SAM2Base

  # Image encoder: a Hiera trunk plus an FpnNeck neck.
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    # NOTE(review): the upstream SAM 2 config for this model is believed to
    # also set `scalp: 1` here; it is absent from this file. Confirm whether
    # the omission is intentional before adding it.
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 144
      num_heads: 2
      stages: [2, 6, 36, 4]
      global_att_blocks: [23, 33, 43]
      window_pos_embed_bkg_spatial_size: [7, 7]
      window_spec: [8, 4, 16, 8]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [1152, 576, 288, 144]
      fpn_top_down_levels: [2, 3]
      fpn_interp_model: nearest

  # Memory attention: `num_layers` copies of the `layer` definition, each
  # configured with one self-attention and one cross-attention module.
  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: true
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        # Same value as memory_encoder.out_dim below.
        kv_in_dim: 64
    num_layers: 4

  # Memory encoder: position encoding, mask downsampler and a fuser made of
  # `num_layers` CXBlock layers.
  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. plain
        # PyYAML) read a bare `1e-6` as a string rather than a float.
        layer_scale_init_value: 1.0e-6
        use_dwconv: true
      num_layers: 2

  # Scalar options set directly on the model.
  num_maskmem: 7
  image_size: 1024
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  directly_add_no_mem_embed: true
  use_high_res_features_in_sam: true

  # Multi-mask output options.
  multimask_output_in_sam: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true

  iou_prediction_use_sigmoid: true
  memory_temporal_stride_for_eval: 1
  non_overlap_masks_for_mem_enc: true

  # Object-pointer options.
  use_obj_ptrs_in_encoder: true
  max_obj_ptrs_in_encoder: 16
  add_tpos_enc_to_obj_ptrs: false
  proj_tpos_enc_in_obj_ptrs: false
  use_signed_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true

  # Object-score prediction options.
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  soft_no_obj_ptr: false
  use_mlp_for_obj_ptr_proj: true
  no_obj_embed_spatial: true

  # Extra options for the SAM mask decoder.
  # `pred_obj_scores`, `pred_obj_scores_mlp` and
  # `use_multimask_token_for_obj_ptr` are intentionally not repeated here:
  # they are already set on the model above with the same values, and a
  # second occurrence is either a duplicate key or (NOTE(review): confirm
  # against SAM2Base) a duplicate keyword argument to the mask decoder.
  sam_mask_decoder_extra_args:
    dynamic_multimask_via_stability: true
    dynamic_multimask_stability_delta: 0.05
    dynamic_multimask_stability_thresh: 0.98

  compile_image_encoder: false