yyliu01 committed 4 days ago

Commit

c6dfc69

verified ·

1 Parent(s): 5b9ba72

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +4 -0
LICENSE +21 -0
README.md +26 -3
avs.code/v1m.code/configs/__init__.py +0 -0
avs.code/v1m.code/configs/auralfuser/architecture.yaml +30 -0
avs.code/v1m.code/configs/config.py +85 -0
avs.code/v1m.code/configs/sam2/sam2_hiera_b+.yaml +114 -0
avs.code/v1m.code/configs/sam2/sam2_hiera_l.yaml +117 -0
avs.code/v1m.code/configs/sam2/sam2_hiera_s.yaml +116 -0
avs.code/v1m.code/configs/sam2/sam2_hiera_t.yaml +118 -0
avs.code/v1m.code/configs/training/sam2_training_config.yaml +62 -0
avs.code/v1m.code/dataloader/audio/audio_augmentation.py +23 -0
avs.code/v1m.code/dataloader/audio/audio_dataset.py +38 -0
avs.code/v1m.code/dataloader/audio/preprocess_vgg/mel_features.py +223 -0
avs.code/v1m.code/dataloader/audio/preprocess_vgg/vggish_input.py +98 -0
avs.code/v1m.code/dataloader/audio/preprocess_vgg/vggish_params.py +53 -0
avs.code/v1m.code/dataloader/dataset.py +67 -0
avs.code/v1m.code/dataloader/sam2_dataset/__init__.py +5 -0
avs.code/v1m.code/dataloader/sam2_dataset/transforms.py +528 -0
avs.code/v1m.code/dataloader/visual/visual_augmentation.py +140 -0
avs.code/v1m.code/dataloader/visual/visual_dataset.py +127 -0
avs.code/v1m.code/inference.py +193 -0
avs.code/v1m.code/loss/training/__init__.py +2 -0
avs.code/v1m.code/loss/training/contrastive_learning.py +201 -0
avs.code/v1m.code/loss/training/sam2_training_loss.py +220 -0
avs.code/v1m.code/main.py +166 -0
avs.code/v1m.code/model/audio/torchvggish/mel_features.py +223 -0
avs.code/v1m.code/model/audio/torchvggish/vggish.py +193 -0
avs.code/v1m.code/model/audio/torchvggish/vggish_input.py +98 -0
avs.code/v1m.code/model/audio/torchvggish/vggish_params.py +53 -0
avs.code/v1m.code/model/aural_fuser.py +567 -0
avs.code/v1m.code/model/mymodel.py +102 -0
avs.code/v1m.code/model/visual/sam2/__init__.py +11 -0
avs.code/v1m.code/model/visual/sam2/build_sam.py +171 -0
avs.code/v1m.code/model/visual/sam2/modeling/__init__.py +5 -0
avs.code/v1m.code/model/visual/sam2/modeling/backbones/__init__.py +5 -0
avs.code/v1m.code/model/visual/sam2/modeling/backbones/hieradet.py +317 -0
avs.code/v1m.code/model/visual/sam2/modeling/backbones/image_encoder.py +134 -0
avs.code/v1m.code/model/visual/sam2/modeling/backbones/utils.py +95 -0
avs.code/v1m.code/model/visual/sam2/modeling/memory_attention.py +169 -0
avs.code/v1m.code/model/visual/sam2/modeling/memory_encoder.py +181 -0
avs.code/v1m.code/model/visual/sam2/modeling/position_encoding.py +221 -0
avs.code/v1m.code/model/visual/sam2/modeling/sam/__init__.py +5 -0
avs.code/v1m.code/model/visual/sam2/modeling/sam/mask_decoder.py +300 -0
avs.code/v1m.code/model/visual/sam2/modeling/sam/prompt_encoder.py +188 -0
avs.code/v1m.code/model/visual/sam2/modeling/sam/transformer.py +367 -0
avs.code/v1m.code/model/visual/sam2/modeling/sam2_base.py +940 -0
avs.code/v1m.code/model/visual/sam2/modeling/sam2_utils.py +323 -0
avs.code/v1m.code/model/visual/sam2/organised_sam2_train.py +811 -0
avs.code/v1m.code/model/visual/sam2/utils/__init__.py +5 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ckpts/avs/v1s/nohup.out filter=lfs diff=lfs merge=lfs -text
+ckpts/avs/v2/nohup.out filter=lfs diff=lfs merge=lfs -text
+ckpts/ref-avs/nohup.out filter=lfs diff=lfs merge=lfs -text
+docs/overview.png filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Yuyuan Liu
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,3 +1,26 @@
----
-license: mit
----

+# AuralSAM2
+> **[CVPRF'26]** [AuralSAM2: Enabling SAM2 Hear
+Through Pyramid Audio-Visual Feature Prompting](#)
+>
+> by Yuyuan Liu, Yuanhong Chen, Chong Wang, Junlin Han, Junde Wu, Can Peng, Jingkun Chen, Yu Tian and Gustavo Carneiro
+>
+<img src="./docs/overview.png" width="850" height="300" />
+## Installation
+Please install the dependencies and dataset by following this [***installation***](./docs/installation.md) document.
+## Getting started
+Please follow this [***instruction***](./docs/before_start.md) document to reproduce our results.
+## Citation
+Please consider citing our work in your publications if it helps your research.
+```bibtex
+@article{liu2025auralsam2,
+  title={AuralSAM2: Enabling SAM2 Hear Through Pyramid Audio-Visual Feature Prompting},
+  author={Liu, Yuyuan and Chen, Yuanhong and Wang, Chong and Han, Junlin and Wu, Junde and Peng, Can and Chen, Jingkun and Tian, Yu and Carneiro, Gustavo},
+  journal={arXiv preprint arXiv:2506.01015},
+  year={2025}
+}
+```

avs.code/v1m.code/configs/__init__.py ADDED Viewed

File without changes

avs.code/v1m.code/configs/auralfuser/architecture.yaml ADDED Viewed

	@@ -0,0 +1,30 @@

+# @package _global_
+aural_fuser:
+  patch_cfgs:
+    - [4, 4]
+    - [2, 2]
+    - [1, 1]
+  f_depths: [3, 6, 12]
+  block_kw:
+    dim: 256
+    num_heads: 4
+    mlp_ratio: 4
+    qkv_bias: true
+    qk_scale: null
+    drop: 0.1
+    attn_drop: 0.1
+    drop_path: 0.0
+    sr_ratio: 4
+    linear: false
+  one_d_kw:
+    dim: 256
+    num_heads: 4
+    mlp_ratio: 4
+    qkv_bias: true
+    qk_scale: null
+    drop: 0.1
+    attn_drop: 0.1
+    drop_path: 0.0
+    sr_ratio: 4
+    linear: false

avs.code/v1m.code/configs/config.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import os
+import numpy
+from easydict import EasyDict
+# v1m.code package root (parent of this `configs/` directory)
+_CODE_ROOT = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+_WORKSPACE_ROOT = os.path.dirname(os.path.dirname(_CODE_ROOT))  # two directory levels above the package root; checkpoint and dataset paths below are joined onto it
+C = EasyDict()
+config = C  # alias of the same EasyDict object
+cfg = C  # alias of the same EasyDict object
+C.seed = 666
+C.audio = EasyDict()
+C.audio.FREEZE_AUDIO_EXTRACTOR = True
+C.audio.PRETRAINED_VGGISH_MODEL_PATH = os.path.join(_WORKSPACE_ROOT, 'ckpts', 'vggish-10086976.pth')
+C.audio.PREPROCESS_AUDIO_TO_LOG_MEL = False
+C.audio.POSTPROCESS_LOG_MEL_WITH_PCA = False
+C.train_vggish = False
+"""Root Directory Config"""
+C.repo_name = 'AV'
+C.root_dir = _CODE_ROOT
+"""Data Dir and Weight Dir"""
+C.data_root_path = os.path.join(_WORKSPACE_ROOT, 'AVSBench')
+C.data_name = 'v1m'
+C.backbone_weight = os.path.join(_WORKSPACE_ROOT, 'ckpts', 'sam_ckpts', 'sam2_hiera_large.pt')
+C.sam_config_path = os.path.join('sam2', 'sam2_hiera_l.yaml')  # relative path; presumably resolved against the configs/ directory by the loader — TODO confirm
+"""Network Config"""
+C.fix_bias = True
+C.bn_eps = 1e-5
+C.bn_momentum = 0.1
+"""Image Config"""
+C.num_classes = 2
+C.image_mean = numpy.array([0.485, 0.456, 0.406])
+C.image_std = numpy.array([0.229, 0.224, 0.225])
+C.image_size = 1024
+C.image_embedding_size = int(C.image_size / 16)  # 1024 / 16 = 64
+C.avsbench_size = (224, 224)
+C.scale_list = [.5, .75, 1., 1.25, 1.5]
+C.ignore_index = 255
+"""Train Config"""
+C.lr = 7.5e-5
+C.batch_size = 8
+C.energy_weight = .05
+C.lr_power = 0.9
+C.momentum = 0.9
+C.weight_decay = 0.05
+C.num_workers = 4
+"""Display Config"""
+C.record_info_iter = 20
+C.display_iter = 50
+"""Wandb Config"""
+# Paste your W&B API key here, or set the WANDB_API_KEY environment variable instead.
+C.wandb_key = ""  # NOTE(review): avoid committing a real key; the environment variable is the safer option
+# Name of your W&B project (workspace)
+C.proj_name = "AVS-final-report"
+C.experiment_name = "v1s-hiera-l"  # NOTE(review): says "v1s" while C.data_name is 'v1m'; this string also names C.saved_dir below — confirm it is intended
+# False = no wandb logging (see utils/tensorboard.py)
+C.wandb_online = False
+"""Save Config"""
+C.saved_dir = os.path.join(_WORKSPACE_ROOT, 'ckpts', C.experiment_name)
+import pathlib
+pathlib.Path(C.saved_dir).mkdir(parents=True, exist_ok=True)  # import-time side effect: the checkpoint directory is created whenever this module is imported

avs.code/v1m.code/configs/sam2/sam2_hiera_b+.yaml ADDED Viewed

	@@ -0,0 +1,114 @@

+# @package _global_
+# Model
+model:
+  _target_: model.visual.sam2.organised_sam2_train.SAM2Train
+  image_encoder:
+    _target_: model.visual.sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: model.visual.sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 112
+      num_heads: 2
+    neck:
+      _target_: model.visual.sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: model.visual.sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [896, 448, 224, 112]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+  memory_attention:
+    _target_: model.visual.sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: model.visual.sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: model.visual.sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [32, 32]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: model.visual.sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [32, 32]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+  memory_encoder:
+      _target_: model.visual.sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: model.visual.sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: model.visual.sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: model.visual.sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: model.visual.sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+  num_maskmem: 7
+  image_size: 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: false
+  only_obj_ptrs_in_the_past_for_eval: true
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  compile_image_encoder: False

avs.code/v1m.code/configs/sam2/sam2_hiera_l.yaml ADDED Viewed

	@@ -0,0 +1,117 @@

+# @package _global_
+# Model
+model:
+  _target_: model.visual.sam2.organised_sam2_train.SAM2Train
+  image_encoder:
+    _target_: model.visual.sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: model.visual.sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 144
+      num_heads: 2
+      stages: [2, 6, 36, 4]
+      global_att_blocks: [23, 33, 43]
+      window_pos_embed_bkg_spatial_size: [7, 7]
+      window_spec: [8, 4, 16, 8]
+    neck:
+      _target_: model.visual.sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: model.visual.sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [1152, 576, 288, 144]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+  memory_attention:
+    _target_: model.visual.sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: model.visual.sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: model.visual.sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [32, 32]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: model.visual.sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [32, 32]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+  memory_encoder:
+      _target_: model.visual.sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: model.visual.sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: model.visual.sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: model.visual.sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: model.visual.sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+  num_maskmem: 7
+  image_size: 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: false
+  only_obj_ptrs_in_the_past_for_eval: true
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  compile_image_encoder: False

avs.code/v1m.code/configs/sam2/sam2_hiera_s.yaml ADDED Viewed

	@@ -0,0 +1,116 @@

+# @package _global_
+# Model (targets use this repository's model.visual.sam2 package, matching the b+/l/t configs)
+model:
+  _target_: model.visual.sam2.organised_sam2_train.SAM2Train
+  image_encoder:
+    _target_: model.visual.sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: model.visual.sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 96
+      num_heads: 1
+      stages: [1, 2, 11, 2]
+      global_att_blocks: [7, 10, 13]
+      window_pos_embed_bkg_spatial_size: [7, 7]
+    neck:
+      _target_: model.visual.sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: model.visual.sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [768, 384, 192, 96]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+  memory_attention:
+    _target_: model.visual.sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: model.visual.sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: model.visual.sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [32, 32]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: model.visual.sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [32, 32]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+  memory_encoder:
+      _target_: model.visual.sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: model.visual.sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: model.visual.sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: model.visual.sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: model.visual.sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+  num_maskmem: 7
+  image_size: 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: false
+  only_obj_ptrs_in_the_past_for_eval: true
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  compile_image_encoder: False

avs.code/v1m.code/configs/sam2/sam2_hiera_t.yaml ADDED Viewed

	@@ -0,0 +1,118 @@

+# @package _global_
+# Model
+model:
+  _target_: model.visual.sam2.organised_sam2_train.SAM2Train
+  image_encoder:
+    _target_: model.visual.sam2.modeling.backbones.image_encoder.ImageEncoder
+    scalp: 1
+    trunk:
+      _target_: model.visual.sam2.modeling.backbones.hieradet.Hiera
+      embed_dim: 96
+      num_heads: 1
+      stages: [1, 2, 7, 2]
+      global_att_blocks: [5, 7, 9]
+      window_pos_embed_bkg_spatial_size: [7, 7]
+    neck:
+      _target_: model.visual.sam2.modeling.backbones.image_encoder.FpnNeck
+      position_encoding:
+        _target_: model.visual.sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 256
+        normalize: true
+        scale: null
+        temperature: 10000
+      d_model: 256
+      backbone_channel_list: [768, 384, 192, 96]
+      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
+      fpn_interp_model: nearest
+  memory_attention:
+    _target_: model.visual.sam2.modeling.memory_attention.MemoryAttention
+    d_model: 256
+    pos_enc_at_input: true
+    layer:
+      _target_: model.visual.sam2.modeling.memory_attention.MemoryAttentionLayer
+      activation: relu
+      dim_feedforward: 2048
+      dropout: 0.1
+      pos_enc_at_attn: false
+      self_attention:
+        _target_: model.visual.sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [32, 32]
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+      d_model: 256
+      pos_enc_at_cross_attn_keys: true
+      pos_enc_at_cross_attn_queries: false
+      cross_attention:
+        _target_: model.visual.sam2.modeling.sam.transformer.RoPEAttention
+        rope_theta: 10000.0
+        feat_sizes: [32, 32]
+        rope_k_repeat: True
+        embedding_dim: 256
+        num_heads: 1
+        downsample_rate: 1
+        dropout: 0.1
+        kv_in_dim: 64
+    num_layers: 4
+  memory_encoder:
+      _target_: model.visual.sam2.modeling.memory_encoder.MemoryEncoder
+      out_dim: 64
+      position_encoding:
+        _target_: model.visual.sam2.modeling.position_encoding.PositionEmbeddingSine
+        num_pos_feats: 64
+        normalize: true
+        scale: null
+        temperature: 10000
+      mask_downsampler:
+        _target_: model.visual.sam2.modeling.memory_encoder.MaskDownSampler
+        kernel_size: 3
+        stride: 2
+        padding: 1
+      fuser:
+        _target_: model.visual.sam2.modeling.memory_encoder.Fuser
+        layer:
+          _target_: model.visual.sam2.modeling.memory_encoder.CXBlock
+          dim: 256
+          kernel_size: 7
+          padding: 3
+          layer_scale_init_value: 1e-6
+          use_dwconv: True  # depth-wise convs
+        num_layers: 2
+  num_maskmem: 7
+  image_size: 224 # 1024
+  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
+  # SAM decoder
+  sigmoid_scale_for_mem_enc: 20.0
+  sigmoid_bias_for_mem_enc: -10.0
+  use_mask_input_as_output_without_sam: true
+  # Memory
+  directly_add_no_mem_embed: true
+  # use high-resolution feature map in the SAM mask decoder
+  use_high_res_features_in_sam: true
+  # output 3 masks on the first click on initial conditioning frames
+  multimask_output_in_sam: true
+  # SAM heads
+  iou_prediction_use_sigmoid: True
+  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+  use_obj_ptrs_in_encoder: true
+  add_tpos_enc_to_obj_ptrs: false
+  only_obj_ptrs_in_the_past_for_eval: false
+  # object occlusion prediction
+  pred_obj_scores: true
+  pred_obj_scores_mlp: true
+  fixed_no_obj_ptr: true
+  # multimask tracking settings
+  multimask_output_for_tracking: true
+  use_multimask_token_for_obj_ptr: true
+  multimask_min_pt_num: 0
+  multimask_max_pt_num: 1
+  use_mlp_for_obj_ptr_proj: true
+  # Compilation flag
+  # HieraT does not currently support compilation, should always be set to False
+  compile_image_encoder: False

avs.code/v1m.code/configs/training/sam2_training_config.yaml ADDED Viewed

	@@ -0,0 +1,62 @@

+# @package _global_
+# Video transforms
+train_transforms:
+  - _target_: dataloader.sam2_dataset.transforms.ComposeAPI
+    transforms:
+      - _target_: dataloader.sam2_dataset.transforms.RandomHorizontalFlip
+        consistent_transform: True
+      - _target_: dataloader.sam2_dataset.transforms.RandomAffine
+        degrees: 25
+        shear: 20
+        image_interpolation: bilinear
+        consistent_transform: True
+      - _target_: dataloader.sam2_dataset.transforms.RandomResizeAPI
+        sizes: 1024 # ${scratch.resolution}
+        square: true
+        consistent_transform: True
+      - _target_: dataloader.sam2_dataset.transforms.ColorJitter
+        consistent_transform: True
+        brightness: 0.1
+        contrast: 0.03
+        saturation: 0.03
+        hue: null
+      - _target_: dataloader.sam2_dataset.transforms.RandomGrayscale
+        p: 0.05
+        consistent_transform: True
+      - _target_: dataloader.sam2_dataset.transforms.ColorJitter
+        consistent_transform: False
+        brightness: 0.1
+        contrast: 0.05
+        saturation: 0.05
+        hue: null
+      - _target_: dataloader.sam2_dataset.transforms.ToTensorAPI
+      - _target_: dataloader.sam2_dataset.transforms.NormalizeAPI
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+loss:
+  all:
+    _target_: loss.training.sam2_training_loss.MultiStepMultiMasksAndIous
+    weight_dict:
+      loss_mask: 20 # 20
+      loss_dice: 1
+      loss_iou: 1
+      loss_class: 1
+    supervise_all_iou: true
+    iou_use_l1_loss: true
+    pred_obj_scores: true
+    focal_gamma_obj_score: 0.0
+    focal_alpha_obj_score: -1.0
+    gpu_num: 4.
+# Contrastive loss (ContrastLoss); loaded in main.py / inference.py → hyp_param.contrastive_learning
+contrastive_learning:
+  temperature: 0.10
+  ignore_idx: 255
+  ood_idx: 254
+  max_views: 512
+  proj_dim: 512
+  sample_limits: 128
+  total_limits: 15240

avs.code/v1m.code/dataloader/audio/audio_augmentation.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import numpy
+class Augmentation(object):
+    """Audio pre-step used by training/inference: int16 waveform -> float in [-1, 1].
+    The previous audiomentations-based transforms were commented out and never applied;
+    behavior is unchanged: only scaling by 1/32768.
+    """
+    def __init__(self, mono=True):
+        self.mono = mono  # stored, but not read by any method of this class
+    def train_aug(self, x_, sr_):  # sr_ is accepted but unused
+        x_ = x_ / 32768.0  # 32768 = 2**15, the magnitude of the most negative int16 value
+        return x_
+    def test_process(self, x_):
+        x_ = x_ / 32768.0  # same scaling as train_aug
+        return x_
+    def __call__(self, x, sr, split):
+        return self.train_aug(x, sr) if split == "train" else self.test_process(x)  # every split other than "train" takes the test path

avs.code/v1m.code/dataloader/audio/audio_dataset.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import torch
+import numpy
+import os
+from dataloader.audio.preprocess_vgg.vggish_input import waveform_to_examples
+import soundfile
+class Audio(torch.utils.data.Dataset):
+    def __init__(self, augmentation, directory_path, split):
+        # temporarily set no augmentation.
+        self.augmentation = augmentation  # invoked as augmentation(wav_data, sample_rate, split) in load_audio_wave
+        self.directory_path = directory_path  # stored; not read by the methods of this class
+        self.split = split
+    def load_audio_wave(self, file_index, file_index_mix):  # both arguments are directories holding an 'audio.wav'; file_index_mix may be None
+        audio_path = os.path.join(file_index, 'audio.wav')
+        wav_data, sample_rate = soundfile.read(audio_path, dtype='int16')
+        assert wav_data.dtype == numpy.int16, 'Bad sample type: %r' % wav_data.dtype
+        if file_index_mix is not None:  # optional blend with a second clip
+            audio_path2 = os.path.join(file_index_mix, 'audio.wav')
+            wav_data2, _ = soundfile.read(audio_path2, dtype='int16')  # NOTE(review): the second clip's sample rate is discarded — assumes both clips share one rate
+            mix_lambda = numpy.random.beta(10, 10)  # blend weight drawn from Beta(10, 10)
+            min_length = min(wav_data.shape[0], wav_data2.shape[0])  # both clips are cropped to the shorter one
+            wav_data = wav_data[:min_length] * mix_lambda + wav_data2[:min_length] * (1-mix_lambda)  # result is floating point, no longer int16
+        wav_data = self.augmentation(wav_data, sample_rate, self.split)
+        audio_log_mel = torch.cat([waveform_to_examples(wav_data[:, 0], sample_rate, True).detach(),  # indexes channels 0 and 1: needs 2-D audio with at least two channels
+                                   waveform_to_examples(wav_data[:, 1], sample_rate, True).detach()], dim=1)
+        # for the vgg preprocess, we will need 5 seconds audio log.
+        if audio_log_mel.shape[0] < 5:  # pad along dim 0 by repeating the last example until there are 5
+            audio_log_mel = torch.cat([audio_log_mel,
+                                       audio_log_mel[-1].unsqueeze(0).repeat(5-audio_log_mel.shape[0], 1, 1, 1)])
+        return audio_log_mel
+    def __len__(self):
+        return len(self.audio_list)  # NOTE(review): self.audio_list is never assigned in this class; raises AttributeError unless a subclass or caller sets it

avs.code/v1m.code/dataloader/audio/preprocess_vgg/mel_features.py ADDED Viewed

	@@ -0,0 +1,223 @@

+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines routines to compute mel spectrogram features from audio waveform."""
+import numpy as np
+def frame(data, window_length, hop_length):
+  """Convert array into a sequence of successive possibly overlapping frames.
+  An n-dimensional array of shape (num_samples, ...) is converted into an
+  (n+1)-D array of shape (num_frames, window_length, ...), where each frame
+  starts hop_length points after the preceding one.
+  This is accomplished using stride_tricks, so the original data is not
+  copied.  However, there is no zero-padding, so any incomplete frames at the
+  end are not included.
+  Args:
+    data: np.array of dimension N >= 1.
+    window_length: Number of samples in each frame.
+    hop_length: Advance (in samples) between each window.
+  Returns:
+    (N+1)-D np.array with as many rows as there are complete frames that can be
+    extracted.
+  """
+  num_samples = data.shape[0]
+  num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))  # NOTE(review): zero or negative when num_samples < window_length — confirm callers supply enough samples
+  shape = (num_frames, window_length) + data.shape[1:]
+  strides = (data.strides[0] * hop_length,) + data.strides  # consecutive frames start hop_length samples apart in memory
+  return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)  # a view onto data's buffer, not a copy
+def periodic_hann(window_length):
+  """Calculate a "periodic" Hann window.
+  The classic Hann window is defined as a raised cosine that starts and
+  ends on zero, and where every value appears twice, except the middle
+  point for an odd-length window.  Matlab calls this a "symmetric" window
+  and np.hanning() returns it.  However, for Fourier analysis, this
+  actually represents just over one cycle of a period N-1 cosine, and
+  thus is not compactly expressed on a length-N Fourier basis.  Instead,
+  it's better to use a raised cosine that ends just before the final
+  zero value - i.e. a complete cycle of a period-N cosine.  Matlab
+  calls this a "periodic" window. This routine calculates it.
+  Args:
+    window_length: The number of points in the returned window.
+  Returns:
+    A 1D np.array containing the periodic hann window.
+  """
+  return 0.5 - (0.5 * np.cos(2 * np.pi / window_length *  # cosine period is window_length, not window_length - 1
+                             np.arange(window_length)))
+def stft_magnitude(signal, fft_length,
+                   hop_length=None,
+                   window_length=None):
+  """Calculate the short-time Fourier transform magnitude.
+  Args:
+    signal: 1D np.array of the input time-domain signal.
+    fft_length: Size of the FFT to apply.
+    hop_length: Advance (in samples) between each frame passed to FFT.
+    window_length: Length of each block of samples to pass to FFT.
+  Returns:
+    2D np.array where each row contains the magnitudes of the fft_length/2+1
+    unique values of the FFT for the corresponding frame of input samples.
+  """
+  frames = frame(signal, window_length, hop_length)
+  # Apply frame window to each frame. We use a periodic Hann (cosine of period
+  # window_length) instead of the symmetric Hann of np.hanning (period
+  # window_length-1).
+  window = periodic_hann(window_length)
+  windowed_frames = frames * window
+  return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))
+# Mel spectrum constants and functions.
+_MEL_BREAK_FREQUENCY_HERTZ = 700.0
+_MEL_HIGH_FREQUENCY_Q = 1127.0
+def hertz_to_mel(frequencies_hertz):
+  """Convert frequencies to mel scale using HTK formula.
+  Args:
+    frequencies_hertz: Scalar or np.array of frequencies in hertz.
+  Returns:
+    Object of same size as frequencies_hertz containing corresponding values
+    on the mel scale.
+  """
+  return _MEL_HIGH_FREQUENCY_Q * np.log(
+      1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
+def spectrogram_to_mel_matrix(num_mel_bins=20,
+                              num_spectrogram_bins=129,
+                              audio_sample_rate=8000,
+                              lower_edge_hertz=125.0,
+                              upper_edge_hertz=3800.0):
+  """Return a matrix that can post-multiply spectrogram rows to make mel.
+  Returns a np.array matrix A that can be used to post-multiply a matrix S of
+  spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
+  "mel spectrogram" M of frames x num_mel_bins.  M = S A.
+  The classic HTK algorithm exploits the complementarity of adjacent mel bands
+  to multiply each FFT bin by only one mel weight, then add it, with positive
+  and negative signs, to the two adjacent mel bands to which that bin
+  contributes.  Here, by expressing this operation as a matrix multiply, we go
+  from num_fft multiplies per frame (plus around 2*num_fft adds) to around
+  num_fft^2 multiplies and adds.  However, because these are all presumably
+  accomplished in a single call to np.dot(), it's not clear which approach is
+  faster in Python.  The matrix multiplication has the attraction of being more
+  general and flexible, and much easier to read.
+  Args:
+    num_mel_bins: How many bands in the resulting mel spectrum.  This is
+      the number of columns in the output matrix.
+    num_spectrogram_bins: How many bins there are in the source spectrogram
+      data, which is understood to be fft_size/2 + 1, i.e. the spectrogram
+      only contains the nonredundant FFT bins.
+    audio_sample_rate: Samples per second of the audio at the input to the
+      spectrogram. We need this to figure out the actual frequencies for
+      each spectrogram bin, which dictates how they are mapped into mel.
+    lower_edge_hertz: Lower bound on the frequencies to be included in the mel
+      spectrum.  This corresponds to the lower edge of the lowest triangular
+      band.
+    upper_edge_hertz: The desired top edge of the highest frequency band.
+  Returns:
+    An np.array with shape (num_spectrogram_bins, num_mel_bins).
+  Raises:
+    ValueError: if frequency edges are incorrectly ordered or out of range.
+  """
+  nyquist_hertz = audio_sample_rate / 2.
+  if lower_edge_hertz < 0.0:
+    raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
+  if lower_edge_hertz >= upper_edge_hertz:
+    raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
+                     (lower_edge_hertz, upper_edge_hertz))
+  if upper_edge_hertz > nyquist_hertz:
+    raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
+                     (upper_edge_hertz, nyquist_hertz))
+  spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
+  spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)
+  # The i'th mel band (starting from i=1) has center frequency
+  # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
+  # band_edges_mel[i+1].  Thus, we need num_mel_bins + 2 values in
+  # the band_edges_mel arrays.
+  band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
+                               hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
+  # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
+  # of spectrogram values.
+  mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))
+  for i in range(num_mel_bins):
+    lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]
+    # Calculate lower and upper slopes for every spectrogram bin.
+    # Line segments are linear in the *mel* domain, not hertz.
+    lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /
+                   (center_mel - lower_edge_mel))
+    upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /
+                   (upper_edge_mel - center_mel))
+    # .. then intersect them with each other and zero.
+    mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope,
+                                                          upper_slope))
+  # HTK excludes the spectrogram DC bin; make sure it always gets a zero
+  # coefficient.
+  mel_weights_matrix[0, :] = 0.0
+  return mel_weights_matrix
+def log_mel_spectrogram(data,
+                        audio_sample_rate=8000,
+                        log_offset=0.0,
+                        window_length_secs=0.025,
+                        hop_length_secs=0.010,
+                        **kwargs):
+  """Convert waveform to a log magnitude mel-frequency spectrogram.
+  Args:
+    data: 1D np.array of waveform data.
+    audio_sample_rate: The sampling rate of data.
+    log_offset: Add this to values when taking log to avoid -Infs.
+    window_length_secs: Duration of each window to analyze.
+    hop_length_secs: Advance between successive analysis windows.
+    **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix.
+  Returns:
+    2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank
+    magnitudes for successive frames.
+  """
+  window_length_samples = int(round(audio_sample_rate * window_length_secs))
+  hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
+  fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
+  spectrogram = stft_magnitude(
+      data,
+      fft_length=fft_length,
+      hop_length=hop_length_samples,
+      window_length=window_length_samples)
+  mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix(
+      num_spectrogram_bins=spectrogram.shape[1],
+      audio_sample_rate=audio_sample_rate, **kwargs))
+  return np.log(mel_spectrogram + log_offset)

avs.code/v1m.code/dataloader/audio/preprocess_vgg/vggish_input.py ADDED Viewed

	@@ -0,0 +1,98 @@

+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Compute input examples for VGGish from audio waveform."""
+# Modification: Return torch tensors rather than numpy arrays
+import torch
+import numpy as np
+import resampy
+from dataloader.audio.preprocess_vgg import mel_features
+from dataloader.audio.preprocess_vgg import vggish_params
+import soundfile as sf
+def waveform_to_examples(data, sample_rate, return_tensor=True):
+    """Converts audio waveform into an array of examples for VGGish.
+    Args:
+      data: np.array of either one dimension (mono) or two dimensions
+        (multi-channel).  Multi-channel input is averaged over axis 1, so the
+        channels are expected on the second axis, i.e. shape
+        (num_samples, num_channels).
+        Each sample is generally expected to lie in the range [-1.0, +1.0],
+        although this is not required.
+      sample_rate: Sample rate of data.
+      return_tensor: Return data as a Pytorch tensor ready for VGGish
+    Returns:
+      If return_tensor is False: 3-D np.array of shape
+      [num_examples, num_frames, num_bands] which represents a sequence of
+      examples, each of which contains a patch of log mel spectrogram, covering
+      num_frames frames of audio and num_bands mel frequency bands, where the
+      frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
+      If return_tensor is True: the same data as a float torch tensor with an
+      extra singleton axis inserted, shape
+      [num_examples, 1, num_frames, num_bands].
+    """
+    # Convert to mono.
+    if len(data.shape) > 1:
+        data = np.mean(data, axis=1)
+    # Resample to the rate assumed by VGGish.
+    if sample_rate != vggish_params.SAMPLE_RATE:
+        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
+    # Compute log mel spectrogram features.
+    log_mel = mel_features.log_mel_spectrogram(
+        data,
+        audio_sample_rate=vggish_params.SAMPLE_RATE,
+        log_offset=vggish_params.LOG_OFFSET,
+        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
+        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
+        num_mel_bins=vggish_params.NUM_MEL_BINS,
+        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
+        upper_edge_hertz=vggish_params.MEL_MAX_HZ)
+    # Frame features into examples.
+    # The spectrogram has one row per STFT hop, so its "sample rate" is the
+    # reciprocal of the hop duration.
+    features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
+    example_window_length = int(round(
+        vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
+    example_hop_length = int(round(
+        vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
+    log_mel_examples = mel_features.frame(
+        log_mel,
+        window_length=example_window_length,
+        hop_length=example_hop_length)
+    if return_tensor:
+        # NOTE(review): the tensor is created with requires_grad=True although
+        # it holds input features - confirm that downstream code needs this.
+        log_mel_examples = torch.tensor(
+            log_mel_examples, requires_grad=True)[:, None, :, :].float()
+    return log_mel_examples
+def wavfile_to_examples(wav_file, return_tensor=True):
+    """Convenience wrapper around waveform_to_examples() for a common WAV format.
+    Args:
+      wav_file: String path to a file, or a file-like object. The file
+        is assumed to contain WAV audio data with signed 16-bit PCM samples.
+      return_tensor: Return data as a Pytorch tensor ready for VGGish
+    Returns:
+      See waveform_to_examples.
+    """
+    # Samples are requested as int16 so the fixed 32768 scale below applies.
+    wav_data, sr = sf.read(wav_file, dtype='int16')
+    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
+    samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]
+    return waveform_to_examples(samples, sr, return_tensor)

avs.code/v1m.code/dataloader/audio/preprocess_vgg/vggish_params.py ADDED Viewed

	@@ -0,0 +1,53 @@

+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Global parameters for the VGGish model.
+See vggish_slim.py for more information (presumably the model definition from
+the original TensorFlow VGGish release; that file is not visible here).
+"""
+# Architectural constants.
+NUM_FRAMES = 96  # Frames in input mel-spectrogram patch.
+NUM_BANDS = 64  # Frequency bands in input mel-spectrogram patch.
+EMBEDDING_SIZE = 128  # Size of embedding layer.
+# Hyperparameters used in feature and example generation.
+SAMPLE_RATE = 16000  # Audio sample rate, in samples per second.
+STFT_WINDOW_LENGTH_SECONDS = 0.025
+STFT_HOP_LENGTH_SECONDS = 0.010
+NUM_MEL_BINS = NUM_BANDS
+MEL_MIN_HZ = 125  # Lower edge of the mel filterbank.
+MEL_MAX_HZ = 7500  # Upper edge of the mel filterbank.
+LOG_OFFSET = 0.01  # Offset used for stabilized log of input mel-spectrogram.
+EXAMPLE_WINDOW_SECONDS = 0.96  # Each example contains 96 10ms frames
+EXAMPLE_HOP_SECONDS = 0.96  # with zero overlap.
+# Parameters used for embedding postprocessing.
+PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
+PCA_MEANS_NAME = 'pca_means'
+QUANTIZE_MIN_VAL = -2.0
+QUANTIZE_MAX_VAL = +2.0
+# Hyperparameters used in training.
+INIT_STDDEV = 0.01  # Standard deviation used to initialize weights.
+LEARNING_RATE = 1e-4  # Learning rate for the Adam optimizer.
+ADAM_EPSILON = 1e-8  # Epsilon for the Adam optimizer.
+# Names of ops, tensors, and features.
+INPUT_OP_NAME = 'vggish/input_features'
+INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'
+OUTPUT_OP_NAME = 'vggish/embedding'
+OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'
+AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'

avs.code/v1m.code/dataloader/dataset.py ADDED Viewed

	@@ -0,0 +1,67 @@

+"""Fused audio-visual dataset for AVSBench-style indexing."""
+import os
+import random
+import PIL.Image
+import numpy
+import torch
+from dataloader.visual.visual_dataset import Visual
+from dataloader.audio.audio_dataset import Audio
+import pandas
+class AV(torch.utils.data.Dataset):
+    """Pairs video frames + labels from `Visual` with log-mel spectrograms from `Audio` via `metadata.csv`."""
+    def __init__(self, split, augmentation, param, root_path='', data_name='find'):
+        self.visual_dataset = Visual(augmentation['visual'], os.path.join(root_path, data_name), split, param.image_size, param.image_embedding_size)
+        self.audio_dataset = Audio(augmentation['audio'], os.path.join(root_path, data_name), split)
+        self.augment = augmentation
+        self.split = split
+        self.file_path = self.organise_files(self.split, root_path, data_name, csv_name_='avss_index/metadata.csv')
+    def __getitem__(self, index):
+        mixing_prob = 0. # we omit this option.
+        other_index = random.randint(1, self.__len__()) - 1 if random.random() < mixing_prob and self.split == 'train' else None
+        frame, label, prompts = self.visual_dataset.load_data(self.file_path[index])
+        if other_index is not None:
+            other_frame, other_label, other_prompts = self.visual_dataset.load_data(self.file_path[other_index])
+            frame, label, prompts = self.visual_mix(frame, other_frame, label, other_label, prompts, other_prompts)
+            audio_mel = self.audio_dataset.load_audio_wave(self.file_path[index], self.file_path[other_index])
+        else:
+            audio_mel = self.audio_dataset.load_audio_wave(self.file_path[index], None)
+        assert other_index is None if self.split == 'test' else 1, print('no mix in validation.')
+        return {'frame': frame, 'label': label, 'spectrogram': audio_mel, 'id': self.file_path[index],
+                'prompts': prompts}
+    def __len__(self):
+        return len(self.file_path)
+    @staticmethod
+    def organise_files(split_, root_path_, data_name_, csv_name_):
+        """Read rows from `csv_name_` under `root_path_` matching split and dataset label."""
+        total_files = pandas.read_csv(os.path.join(root_path_, csv_name_))
+        files_info = total_files[(total_files["split"] == split_) & (total_files["label"] == data_name_)]['uid']
+        files_path = [os.path.join(root_path_, data_name_, files_name) for files_name in files_info]
+        del total_files, files_info
+        return files_path
+    @staticmethod
+    def visual_mix(frame1, frame2, label1, label2, prompts1, prompts2):
+        mix_frame = frame1.clone()
+        mix_label = label1.clone()
+        bbx1, bby1, bbx2, bby2 = 0, 0, mix_label.shape[1] - 1, mix_label.shape[2] - 1
+        for i in range(0, mix_frame.shape[0]):
+            label_canvas_foreground = label2[i, bbx1:bbx2, bby1:bby2] > 0.
+            mix_frame[i, :, bbx1:bbx2, bby1:bby2][:, label_canvas_foreground] = (
+                    frame2[i, :, bbx1:bbx2, bby1:bby2][:, label_canvas_foreground])
+            mix_label[i, bbx1:bbx2, bby1:bby2][label_canvas_foreground] = (
+                    label2[i, bbx1:bbx2, bby1:bby2][label_canvas_foreground])
+        return mix_frame, mix_label, prompts1

avs.code/v1m.code/dataloader/sam2_dataset/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.

avs.code/v1m.code/dataloader/sam2_dataset/transforms.py ADDED Viewed

	@@ -0,0 +1,528 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Transforms and data augmentation for both image + bbox.
+"""
+import logging
+import random
+from typing import Iterable
+import torch
+import torchvision.transforms as T
+import torchvision.transforms.functional as F
+import torchvision.transforms.v2.functional as Fv2
+from PIL import Image as PILImage
+# from docutils.nodes import label
+import numpy
+from torchvision.transforms import InterpolationMode
+# from utils.data_utils import VideoDatapoint
+def hflip(frames, labels, index):
+    # print(index)
+    # print(len(frames), frames[index].size, type(frames[index]))
+    # print(len(labels), labels[index].size, type(labels[index]))
+    frames[index] = F.hflip(frames[index])
+    labels[index] = F.hflip(labels[index])
+    # for obj in frames[index].objects:
+    #     if obj.segment is not None:
+    #         obj.segment = F.hflip(obj.segment)
+    return frames, labels
+def get_size_with_aspect_ratio(image_size, size, max_size=None):
+    w, h = image_size
+    if max_size is not None:
+        min_original_size = float(min((w, h)))
+        max_original_size = float(max((w, h)))
+        if max_original_size / min_original_size * size > max_size:
+            size = max_size * min_original_size / max_original_size
+    if (w <= h and w == size) or (h <= w and h == size):
+        return (h, w)
+    if w < h:
+        ow = int(round(size))
+        oh = int(round(size * h / w))
+    else:
+        oh = int(round(size))
+        ow = int(round(size * w / h))
+    return (oh, ow)
+def resize(frames, labels, index, size, max_size=None, square=False, v2=False):
+    # size can be min_size (scalar) or (w, h) tuple
+    def get_size(image_size, size, max_size=None):
+        if isinstance(size, (list, tuple)):
+            return size[::-1]
+        else:
+            return get_size_with_aspect_ratio(image_size, size, max_size)
+    if square:
+        size = size, size
+    else:
+        raise NotImplementedError
+        # cur_size = (
+        #     frames[index].data.size()[-2:][::-1]
+        #     if v2
+        #     else frames[index].data.size
+        # )
+        # size = get_size(cur_size, size, max_size)
+    # old_size = (
+    #     frames[index].data.size()[-2:][::-1]
+    #     if v2
+    #     else frames[index].data.size
+    # )
+    if v2:
+        frames[index].data = Fv2.resize(
+            frames[index].data, size, antialias=True
+        )
+    else:
+        frames[index] = F.resize(frames[index], size)
+        labels[index] = F.resize(labels[index], size)
+    # new_size = (
+    #     frames[index].data.size()[-2:][::-1]
+    #     if v2
+    #     else frames[index].data.size
+    # )
+    # for obj in frames[index].objects:
+    #     if obj.segment is not None:
+    #         obj.segment = F.resize(obj.segment[None, None], size).squeeze()
+    # h, w = size
+    # frames[index].size = (h, w)
+    return frames, labels
+def pad(frames, index, padding, v2=False):
+    """Pad the frame at `index` and every object segment attached to it.
+    The frame is accessed through `.size`, `.data` and `.objects` (each object
+    having a `.segment`), so this presumably expects datapoint-style frame
+    objects rather than plain images - TODO confirm against the callers.
+    Args:
+        frames: Sequence of frame objects; the entry at `index` is modified in place.
+        index: Position of the frame to pad.
+        padding: Either 2 values (pad on the right and bottom only) or 4
+            values (left, top, right, bottom).
+        v2: Pad the segments with the torchvision v2 functional API; the frame
+            data itself is always padded with `F.pad`.
+    Returns:
+        The `frames` sequence.
+    """
+    # `.size` is unpacked as (height, width) here.
+    old_h, old_w = frames[index].size
+    h, w = old_h, old_w
+    if len(padding) == 2:
+        # assumes that we only pad on the bottom right corners
+        frames[index].data = F.pad(
+            frames[index].data, (0, 0, padding[0], padding[1])
+        )
+        h += padding[1]
+        w += padding[0]
+    else:
+        # left, top, right, bottom
+        frames[index].data = F.pad(
+            frames[index].data,
+            (padding[0], padding[1], padding[2], padding[3]),
+        )
+        h += padding[1] + padding[3]
+        w += padding[0] + padding[2]
+    # Record the padded size on the frame.
+    frames[index].size = (h, w)
+    # Apply the same padding to every segment that is present.
+    for obj in frames[index].objects:
+        if obj.segment is not None:
+            if v2:
+                if len(padding) == 2:
+                    obj.segment = Fv2.pad(obj.segment, (0, 0, padding[0], padding[1]))
+                else:
+                    obj.segment = Fv2.pad(obj.segment, tuple(padding))
+            else:
+                if len(padding) == 2:
+                    obj.segment = F.pad(obj.segment, (0, 0, padding[0], padding[1]))
+                else:
+                    obj.segment = F.pad(obj.segment, tuple(padding))
+    return frames
+class RandomHorizontalFlip:
+    def __init__(self, consistent_transform, p=0.5):
+        self.p = p
+        self.consistent_transform = consistent_transform
+    def __call__(self, frames, labels, **kwargs):
+        if self.consistent_transform:
+            if random.random() < self.p:
+                for i in range(len(frames)):
+                    frames, labels = hflip(frames, labels, i)
+            return frames, labels
+        for i in range(len(frames)):
+            if random.random() < self.p:
+                    frames, labels = hflip(frames, labels, i)
+        return frames, labels
+class RandomResizeAPI:
+    def __init__(
+        self, sizes, consistent_transform, max_size=None, square=False, v2=False
+    ):
+        if isinstance(sizes, int):
+            sizes = (sizes,)
+        assert isinstance(sizes, Iterable)
+        self.sizes = list(sizes)
+        self.max_size = max_size
+        self.square = square
+        self.consistent_transform = consistent_transform
+        self.v2 = v2
+    def __call__(self, frames, labels):
+        if self.consistent_transform:
+            size = random.choice(self.sizes)
+            for i in range(len(frames)):
+                frames, labels = resize(
+                    frames, labels, i, size, self.max_size, square=self.square, v2=self.v2
+                )
+            return frames, labels
+        for i in range(len(frames)):
+            size = random.choice(self.sizes)
+            frames, labels = resize(
+                frames, labels, i, size, self.max_size, square=self.square, v2=self.v2
+            )
+        return frames, labels
+class ToTensorAPI:
+    def __init__(self, v2=False):
+        self.v2 = v2
+    def __call__(self, frames, labels, **kwargs):
+        for img_idx in range(len(frames)):
+            if self.v2:
+                raise NotImplementedError
+                # frames[img_idx] = Fv2.to_tensor(frames[img_idx])
+            else:
+                frames[img_idx] = F.to_tensor(frames[img_idx])
+                labels[img_idx] = torch.tensor(numpy.array(labels[img_idx]), dtype=torch.float)
+        return frames, labels
+class NormalizeAPI:
+    def __init__(self, mean, std, v2=False):
+        self.mean = mean
+        self.std = std
+        self.v2 = v2
+    def __call__(self, frames, labels, **kwargs):
+        for img_idx in range(len(frames)):
+            # if self.v2:
+            #     img.data = Fv2.convert_image_dtype(img.data, torch.float32)
+            #     img.data = Fv2.normalize(img.data, mean=self.mean, std=self.std)
+            # else:
+            frames[img_idx] = F.normalize(frames[img_idx], mean=self.mean, std=self.std)
+        return frames, labels
+'''
+    <dataloader.sam2_dataset.transforms.RandomHorizontalFlip object at 0x75c815561b40>
+    <dataloader.sam2_dataset.transforms.RandomAffine object at 0x75c815561bd0>
+    <dataloader.sam2_dataset.transforms.RandomResizeAPI object at 0x75c815561c60>
+    <dataloader.sam2_dataset.transforms.ColorJitter object at 0x75c815561cc0>
+    <dataloader.sam2_dataset.transforms.RandomGrayscale object at 0x75c815561cf0>
+    <dataloader.sam2_dataset.transforms.ColorJitter object at 0x75c815561de0>
+    <dataloader.sam2_dataset.transforms.ToTensorAPI object at 0x75c815507280>
+    <dataloader.sam2_dataset.transforms.NormalizeAPI object at 0x75c815507490>
+'''
+class ComposeAPI:
+    def __init__(self, transforms):
+        self.transforms = transforms
+    def __call__(self, frames, labels, **kwargs):
+        for t in self.transforms:
+            frames, labels = t(frames, labels, **kwargs)
+        return frames, labels
+    def __repr__(self):
+        format_string = self.__class__.__name__ + "("
+        for t in self.transforms:
+            format_string += "\n"
+            format_string += "    {0}".format(t)
+        format_string += "\n)"
+        return format_string
+class RandomGrayscale:
+    def __init__(self, consistent_transform, p=0.5):
+        self.p = p
+        self.consistent_transform = consistent_transform
+        self.Grayscale = T.Grayscale(num_output_channels=3)
+    def __call__(self, frames, labels, **kwargs):
+        if self.consistent_transform:
+            if random.random() < self.p:
+                for img_idx in range(len(frames)):
+                    frames[img_idx] = self.Grayscale(frames[img_idx])
+            return frames, labels
+        for img_idx in range(len(frames)):
+            if random.random() < self.p:
+                frames[img_idx] = self.Grayscale(frames[img_idx])
+        return frames, labels
+class ColorJitter:
+    def __init__(self, consistent_transform, brightness, contrast, saturation, hue):
+        self.consistent_transform = consistent_transform
+        self.brightness = (
+            brightness
+            if isinstance(brightness, list)
+            else [max(0, 1 - brightness), 1 + brightness]
+        )
+        self.contrast = (
+            contrast
+            if isinstance(contrast, list)
+            else [max(0, 1 - contrast), 1 + contrast]
+        )
+        self.saturation = (
+            saturation
+            if isinstance(saturation, list)
+            else [max(0, 1 - saturation), 1 + saturation]
+        )
+        self.hue = hue if isinstance(hue, list) or hue is None else ([-hue, hue])
+    def __call__(self, frames, labels, **kwargs):
+        if self.consistent_transform:
+            # Create a color jitter transformation params
+            (
+                fn_idx,
+                brightness_factor,
+                contrast_factor,
+                saturation_factor,
+                hue_factor,
+            ) = T.ColorJitter.get_params(
+                self.brightness, self.contrast, self.saturation, self.hue
+            )
+        for img in frames:
+            if not self.consistent_transform:
+                (
+                    fn_idx,
+                    brightness_factor,
+                    contrast_factor,
+                    saturation_factor,
+                    hue_factor,
+                ) = T.ColorJitter.get_params(
+                    self.brightness, self.contrast, self.saturation, self.hue
+                )
+            for fn_id in fn_idx:
+                if fn_id == 0 and brightness_factor is not None:
+                    img = F.adjust_brightness(img, brightness_factor)
+                elif fn_id == 1 and contrast_factor is not None:
+                    img = F.adjust_contrast(img, contrast_factor)
+                elif fn_id == 2 and saturation_factor is not None:
+                    img = F.adjust_saturation(img, saturation_factor)
+                elif fn_id == 3 and hue_factor is not None:
+                    img = F.adjust_hue(img, hue_factor)
+        return frames, labels
+class RandomAffine:
+    def __init__(
+        self,
+        degrees,
+        consistent_transform,
+        scale=None,
+        translate=None,
+        shear=None,
+        image_mean=(123, 116, 103),
+        label_fill_value=0.,
+        log_warning=True,
+        num_tentatives=1,
+        image_interpolation="bicubic",
+    ):
+        """
+        The mask is required for this transform.
+        if consistent_transform if True, then the same random affine is applied to all frames and masks.
+        """
+        self.degrees = degrees if isinstance(degrees, list) else ([-degrees, degrees])
+        self.scale = scale
+        self.shear = (
+            shear if isinstance(shear, list) else ([-shear, shear] if shear else None)
+        )
+        self.translate = translate
+        self.fill_img = image_mean
+        self.fill_label = label_fill_value
+        self.consistent_transform = consistent_transform
+        self.log_warning = log_warning
+        self.num_tentatives = num_tentatives
+        assert self.num_tentatives >= 1., 'must have at least one if we utilise the augmentation.'
+        if image_interpolation == "bicubic":
+            self.image_interpolation = InterpolationMode.BICUBIC
+        elif image_interpolation == "bilinear":
+            self.image_interpolation = InterpolationMode.BILINEAR
+        else:
+            raise NotImplementedError
+    def __call__(self, frames, labels, **kwargs):
+        for _tentative in range(self.num_tentatives):
+            res_img, res_labels = self.transform_frames(frames, labels)
+            # if res is not None:
+        return res_img, res_labels
+        # raise NotImplementedError
+        # if self.log_warning:
+        #     logging.warning(
+        #         f"Skip RandomAffine for zero-area mask in first frame after {self.num_tentatives} tentatives"
+        #     )
+        # return frames
+    def transform_frames(self, frames, labels):
+        _, height, width = F.get_dimensions(frames[0])
+        img_size = [width, height]
+        if self.consistent_transform:
+            # Create a random affine transformation
+            affine_params = T.RandomAffine.get_params(
+                degrees=self.degrees,
+                translate=self.translate,
+                scale_ranges=self.scale,
+                shears=self.shear,
+                img_size=img_size,
+            )
+        for img_idx, img in enumerate(frames):
+            if not self.consistent_transform:
+                # if not consistent we create a new affine params for every frame&mask pair Create a random affine transformation
+                affine_params = T.RandomAffine.get_params(
+                    degrees=self.degrees,
+                    translate=self.translate,
+                    scale_ranges=self.scale,
+                    shears=self.shear,
+                    img_size=img_size,
+                )
+            frames[img_idx] = F.affine(
+                img,
+                *affine_params,
+                interpolation=self.image_interpolation,
+                fill=self.fill_img,
+            )
+            labels[img_idx] = F.affine(
+                labels[img_idx],
+                *affine_params,
+                # default: interpolation='nearest',
+                fill=self.fill_label,
+            )
+        return frames, labels
+'''
+def random_mosaic_frame(
+    datapoint,
+    index,
+    grid_h,
+    grid_w,
+    target_grid_y,
+    target_grid_x,
+    should_hflip,
+):
+    # Step 1: downsize the images and paste them into a mosaic
+    image_data = datapoint.frames[index].data
+    is_pil = isinstance(image_data, PILImage.Image)
+    if is_pil:
+        H_im = image_data.height
+        W_im = image_data.width
+        image_data_output = PILImage.new("RGB", (W_im, H_im))
+    else:
+        H_im = image_data.size(-2)
+        W_im = image_data.size(-1)
+        image_data_output = torch.zeros_like(image_data)
+    downsize_cache = {}
+    for grid_y in range(grid_h):
+        for grid_x in range(grid_w):
+            y_offset_b = grid_y * H_im // grid_h
+            x_offset_b = grid_x * W_im // grid_w
+            y_offset_e = (grid_y + 1) * H_im // grid_h
+            x_offset_e = (grid_x + 1) * W_im // grid_w
+            H_im_downsize = y_offset_e - y_offset_b
+            W_im_downsize = x_offset_e - x_offset_b
+            if (H_im_downsize, W_im_downsize) in downsize_cache:
+                image_data_downsize = downsize_cache[(H_im_downsize, W_im_downsize)]
+            else:
+                image_data_downsize = F.resize(
+                    image_data,
+                    size=(H_im_downsize, W_im_downsize),
+                    interpolation=InterpolationMode.BILINEAR,
+                    antialias=True,  # antialiasing for downsizing
+                )
+                downsize_cache[(H_im_downsize, W_im_downsize)] = image_data_downsize
+            if should_hflip[grid_y, grid_x].item():
+                image_data_downsize = F.hflip(image_data_downsize)
+            if is_pil:
+                image_data_output.paste(image_data_downsize, (x_offset_b, y_offset_b))
+            else:
+                image_data_output[:, y_offset_b:y_offset_e, x_offset_b:x_offset_e] = (
+                    image_data_downsize
+                )
+    datapoint.frames[index].data = image_data_output
+    # Step 2: downsize the masks and paste them into the target grid of the mosaic
+    for obj in datapoint.frames[index].objects:
+        if obj.segment is None:
+            continue
+        assert obj.segment.shape == (H_im, W_im) and obj.segment.dtype == torch.uint8
+        segment_output = torch.zeros_like(obj.segment)
+        target_y_offset_b = target_grid_y * H_im // grid_h
+        target_x_offset_b = target_grid_x * W_im // grid_w
+        target_y_offset_e = (target_grid_y + 1) * H_im // grid_h
+        target_x_offset_e = (target_grid_x + 1) * W_im // grid_w
+        target_H_im_downsize = target_y_offset_e - target_y_offset_b
+        target_W_im_downsize = target_x_offset_e - target_x_offset_b
+        segment_downsize = F.resize(
+            obj.segment[None, None],
+            size=(target_H_im_downsize, target_W_im_downsize),
+            interpolation=InterpolationMode.BILINEAR,
+            antialias=True,  # antialiasing for downsizing
+        )[0, 0]
+        if should_hflip[target_grid_y, target_grid_x].item():
+            segment_downsize = F.hflip(segment_downsize[None, None])[0, 0]
+        segment_output[
+            target_y_offset_b:target_y_offset_e, target_x_offset_b:target_x_offset_e
+        ] = segment_downsize
+        obj.segment = segment_output
+    return datapoint
+class RandomMosaicVideoAPI:
+    def __init__(self, prob=0.15, grid_h=2, grid_w=2, use_random_hflip=False):
+        self.prob = prob
+        self.grid_h = grid_h
+        self.grid_w = grid_w
+        self.use_random_hflip = use_random_hflip
+    def __call__(self, frames, **kwargs):
+        if random.random() > self.prob:
+            return datapoint
+        # select a random location to place the target mask in the mosaic
+        target_grid_y = random.randint(0, self.grid_h - 1)
+        target_grid_x = random.randint(0, self.grid_w - 1)
+        # whether to flip each grid in the mosaic horizontally
+        if self.use_random_hflip:
+            should_hflip = torch.rand(self.grid_h, self.grid_w) < 0.5
+        else:
+            should_hflip = torch.zeros(self.grid_h, self.grid_w, dtype=torch.bool)
+        for i in range(len(datapoint.frames)):
+            datapoint = random_mosaic_frame(
+                datapoint,
+                i,
+                grid_h=self.grid_h,
+                grid_w=self.grid_w,
+                target_grid_y=target_grid_y,
+                target_grid_x=target_grid_x,
+                should_hflip=should_hflip,
+            )
+        return datapoint
+'''

avs.code/v1m.code/dataloader/visual/visual_augmentation.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import random
+import matplotlib.pyplot as plt
+import numpy
+import torch
+import torchvision.transforms.functional as F
+import torchvision.transforms as transforms
+class Augmentation(object):
+    def __init__(self, image_mean, image_std, image_width, image_height, scale_list, ignore_index=255):
+        self.image_size = (image_height, image_width)
+        # self.image_norm = (image_mean, image_std)
+        # self.get_crop_pos = transforms.RandomCrop(self.image_size)
+        self.color_jitter = transforms.ColorJitter(brightness=.5, contrast=.5, saturation=.5, hue=.25)
+        self.gaussian_blurring = transforms.GaussianBlur((3, 3))
+        self.scale_list = scale_list
+        self.normalise = transforms.Normalize(mean=image_mean, std=image_std)
+        self.to_tensor = transforms.ToTensor()
+        self.ignore_index = ignore_index
+        # self.normalise = transforms.Normalize(mean=image_mean, std=image_std)
+        # if setup == "avs" or setup == "avss" or setup == "avss_binary":
+        #     # AVS
+        #     self.scale_list = [.5, .75, 1.]
+        #     self.color_jitter = None
+        # else:
+        #     # COCO
+        #     # self.scale_list = [.75, 1., 1.25, 1.5, 1.75, 2.]
+        #     self.scale_list = [0.5,0.75,1.0,1.25,1.5,1.75,2.0]
+    # def normalise(self, image):
+    #     image = image / 255.0
+    #     image = image - self.image_norm[0]
+    #     image = image / self.image_norm[1]
+    #     return image
+    def resize(self, image_, label_, size=None):
+        h_, w_ = self.image_size if size is None else size
+        image_ = F.resize(image_, (h_, w_), transforms.InterpolationMode.BICUBIC)
+        label_ = F.resize(label_, (h_, w_), transforms.InterpolationMode.NEAREST)
+        return image_, label_
+    def random_crop_with_padding(self, image_, label_):
+        w_, h_ = image_.size
+        if min(h_, w_) < min(self.image_size):
+            res_w_ = max(self.image_size[0] - w_, 0)
+            res_h_ = max(self.image_size[1] - h_, 0)
+            image_ = F.pad(image_, [0, 0, res_w_, res_h_], fill=(numpy.array(self.image_norm[0]) * 255.).tolist())
+            # image_ = F.pad(image_, [0, 0, res_w_, res_h_], fill=self.ignore_index) # if error, define the padding value.
+            label_ = F.pad(label_, [0, 0, res_w_, res_h_], fill=self.ignore_index)
+        pos_ = self.get_crop_pos.get_params(image_, self.image_size)
+        image_ = F.crop(image_, *pos_)
+        label_ = F.crop(label_, *pos_)
+        return image_, label_
+    # @staticmethod
+    def random_scales(self, image_, label_):
+        w_, h_ = image_.size
+        chosen_scale = random.choice(self.scale_list)
+        w_, h_ = int(w_ * chosen_scale), int(h_ * chosen_scale)
+        image_ = F.resize(image_, (h_, w_), transforms.InterpolationMode.BICUBIC)
+        label_ = F.resize(label_, (h_, w_), transforms.InterpolationMode.NEAREST)
+        return image_, label_
+    @staticmethod
+    def random_flip_h(image_, label_):
+        chosen_flip = random.random() > 0.5
+        image_ = F.hflip(image_) if chosen_flip else image_
+        label_ = F.hflip(label_) if chosen_flip else label_
+        return image_, label_
+    def augment_entire_clip(self, x_list, y_list):
+        degree_ = float(torch.empty(1).uniform_(float(-25.), float(25.)).item())
+        shear_ = [float(torch.empty(1).uniform_(float(-20.), float(20.)).item()),
+                 torch.empty(1).uniform_(float(-20.), float(20.)).item()]
+        dice =  random.random()
+        for index, single_x in enumerate(x_list):
+            if dice <= 0.1:
+                single_x = F.rgb_to_grayscale(single_x, num_output_channels=3)
+            single_x = F.affine(single_x, angle=degree_, shear=shear_, translate=[0,0], scale=1.,
+                               interpolation=transforms.InterpolationMode.BILINEAR, fill=[0., 0., 0.])
+            single_y = F.affine(y_list[index], angle=degree_, shear=shear_, translate=[0,0], scale=1.,
+                               interpolation=transforms.InterpolationMode.NEAREST, fill=[0.])
+            x_list[index] = single_x
+            y_list[index] = single_y
+        return x_list, y_list
+    def train_aug(self, x_, y_):
+        x_, y_ = self.random_flip_h(x_, y_)
+        # # x, y = self.random_scales(x, y)
+        x_, y_ = self.resize(x_, y_)
+        if self.color_jitter is not None and random.random() < 0.5:
+            x_ = self.color_jitter(x_)
+        if self.gaussian_blurring is not None and random.random() < 0.5:
+            x_ = self.gaussian_blurring(x_)
+        # x, y = self.random_crop_with_padding(x, y)
+        x_ = self.normalise(self.to_tensor(x_)).type(torch.float32)
+        # receive pseudo labels.
+        y_ = torch.tensor(numpy.array(y_)[numpy.newaxis, ...], dtype=torch.float)
+        return x_, y_
+    def test_process(self, x_, y_):
+        # x = self.to_tensor(x)
+        # y = torch.tensor(numpy.asarray(y)).long()
+        # following AVSbench setup, we fix image size (224, 224)
+        x_, y_ = self.resize(x_, y_)
+        x_ = self.normalise(self.to_tensor(x_)).type(torch.float32)
+        y_ = torch.tensor(numpy.array(y_)[numpy.newaxis, ...], dtype=torch.float)
+        return x_, y_
+    def __call__(self, x, y, split):
+        return self.train_aug(x, y) if split == "train" \
+            else self.test_process(x, y)

avs.code/v1m.code/dataloader/visual/visual_dataset.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import os
+import re
+import PIL.Image
+import matplotlib.pyplot as plt
+import numpy
+import torch
+import pandas
+import torchvision
+class Visual(torch.utils.data.Dataset):
+    def __init__(self, augmentation, directory_path, split, image_size, image_embedding_size):
+        self.augment = augmentation
+        self.directory_path = directory_path
+        self.split = split
+        self.image_size = image_size
+        self.embedding_size = image_embedding_size
+    def load_data(self, file_prefix):
+        frame_path = os.path.join(file_prefix, 'frames')
+        frame_path = [os.path.join(frame_path, i) for i in os.listdir(frame_path)]
+        label_path = os.path.join(file_prefix, 'labels_rgb')
+        label_path = [os.path.join(label_path, i) for i in os.listdir(label_path)]
+        # if self.split == 'train':
+        #     label_path += [os.path.join(file_prefix.replace('v1s', 'v1s_sam2_pseudo_labels'), i) for i in
+        #                    os.listdir(file_prefix.replace('v1s', 'v1s_sam2_pseudo_labels'))]
+        frame_path.sort(key=lambda x: tuple(map(int, x.split('/')[-1].split("_")[-1].split('.jpg')[0])))
+        label_path.sort(key=lambda x: tuple(map(int, x.split('/')[-1].split("_")[-1].split('.png')[0])))
+        frame = [PIL.Image.open(i) for i in frame_path]
+        label = [PIL.Image.open(i).convert('L') for i in label_path]
+        # if self.split == 'train':
+        #     label += [PIL.Image.new('L', frame[0].size)] * (len(frame)-len(label))
+        label_idx = torch.tensor(list([1] + [0] * 4), dtype=torch.bool)
+        # fulfill the empty page.
+        # we utilise pseudo-labels now.
+        # label_idx = torch.tensor(list([1] + [0] * (len(frame) - len(label))), dtype=torch.bool)
+        # label += [PIL.Image.new('L', frame[0].size)] * (len(frame)-len(label))
+        # receive the prompts from the ground truth.
+        # prompts = {"point_coords": torch.nan, "point_labels": torch.nan,
+        #            "masks": [None]*len(frame), "box_coords": [None]*len(frame)}
+        prompts = {}
+        image_batch = [None]*len(frame)
+        label_batch = [None]*len(frame)
+        if self.split == 'train':
+            # frame, label = self.augment.augment_entire_clip(frame, label)
+            frame, label = self.augment(frame, label)
+        for i in range(len(frame)):
+            if self.split == 'test':
+                curr_frame, curr_label = self.augment(frame[i], label[i], split=self.split)
+            else:
+                curr_frame, curr_label = frame[i], label[i]
+            # if self.split == 'train' and i > 0:
+            #     curr_label = curr_label / 255.
+            #     curr_label[curr_label > 0.5] = 1
+            #     curr_label[curr_label < 0.5] = 0
+            #     # curr_label[(0.05 < curr_label) & (curr_label < 0.95)] = 255
+            #     # we temporarily make it to be hard mask;
+            #     # curr_label = ((curr_label / 255.) - 0.5) * 2
+            #     # curr_label[curr_label >= 0.] = 1.
+            #     # curr_label[curr_label < 0.] = 0.
+            # else:
+            curr_label[curr_label > 0.] = 1.
+            image_batch[i], label_batch[i] = curr_frame, curr_label
+            # image_batch[i], label_batch[i] = self.augment(frame[i], label[i], split=self.split)
+            # note: we simply convert the code to binary mask in v1s, v1m;
+            # to some reason, we failed to load the label in `L' format and had to hardcoding here.
+            # label_batch[i][label_batch[i] > 0.] = 1.
+            # prompts['box_coords'][i], prompts['masks'][i] = self.receive_other_prompts(label_batch[i])
+        # organise the prompts
+        # prompts.update({'masks': torch.stack(prompts['masks'], dim=0)})
+        # prompts.update({'box_coords': torch.stack(prompts['box_coords'], dim=0)})
+        # prompts.update({'point_labels': torch.stack(prompts['point_labels'], dim=0)})
+        prompts.update({'label_index': label_idx})
+        return torch.stack(image_batch, dim=0), torch.stack(label_batch, dim=0), prompts
+    def receive_other_prompts(self, y_):
+        # y_ = torch.zeros_like(y_)
+        if len(torch.unique(y_)) > 1:
+            # foreground point
+            points_foreground = torch.stack(torch.where(y_ > 0)[::-1], dim=0).transpose(1, 0)
+            # bbox prompt (left-top corner & right-bottom corner)
+            bbox_one = torch.min(points_foreground[:, 0]), torch.min(points_foreground[:, 1])
+            bbox_fou = torch.max(points_foreground[:, 0]), torch.max(points_foreground[:, 1])
+            bbox_coord = torch.tensor(bbox_one + bbox_fou, dtype=torch.float)
+            bbox_coord = self.transform_coords(bbox_coord, orig_hw=y_.squeeze().shape)
+            # mask prompt
+            low_mask = torchvision.transforms.functional.resize(y_.clone(), [self.embedding_size*4, self.embedding_size*4],
+                                                                torchvision.transforms.InterpolationMode.NEAREST)
+        else:
+            # for the pure background situation.
+            bbox_coord = torch.zeros([4], dtype=torch.float).fill_(float('nan'))
+            low_mask = torch.zeros([1, self.embedding_size*4, self.embedding_size*4], dtype=torch.float).fill_(float('nan'))
+        return bbox_coord, low_mask
+    # we transfer the coords to SAM's input resolution (1024, 1024).
+    def transform_coords(self, coords: torch.Tensor, orig_hw=None) -> torch.Tensor:
+        """
+        Expects a torch tensor with length 2 in the last dimension. The coordinates can be in absolute image or normalized coordinates,
+        If the coords are in absolute image coordinates, normalize should be set to True and original image size is required.
+        Returns
+            Un-normalized coordinates in the range of [0, 1] which is expected by the sam2 model.
+        """
+        h, w = orig_hw
+        coords = coords.clone().reshape(-1, 2, 2)
+        coords[..., 0] = coords[..., 0] / w
+        coords[..., 1] = coords[..., 1] / h
+        coords = coords * self.image_size  # unnormalize coords
+        return coords.reshape(4)

avs.code/v1m.code/inference.py ADDED Viewed

	@@ -0,0 +1,193 @@

+"""Distributed inference on the test set; runs the same three `process` modes as training validation."""
+import os
+import pathlib
+import torch
+import numpy
+import random
+import argparse
+from easydict import EasyDict
+# Avoid import failure when configs.config creates saved_dir without write permission.
+_real_mkdir = pathlib.Path.mkdir
+def _safe_mkdir(self, mode=0o777, parents=False, exist_ok=False):
+    try:
+        return _real_mkdir(self, mode, parents=parents, exist_ok=exist_ok)
+    except PermissionError:
+        pass
+pathlib.Path.mkdir = _safe_mkdir
+def seed_it(seed):
+    random.seed(seed)
+    os.environ["PYTHONSEED"] = str(seed)
+    numpy.random.seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = True
+    torch.backends.cudnn.enabled = True
+    torch.manual_seed(seed)
+class _DummyTensorboard:
+    """Minimal Tensorboard stub so Trainer.valid runs without wandb logging."""
+    def upload_wandb_info(self, info_dict):
+        pass
+    def upload_wandb_image(self, *args, **kwargs):
+        pass
+def main(local_rank, ngpus_per_node, hyp_param):
+    hyp_param.local_rank = local_rank
+    torch.distributed.init_process_group(
+        backend='nccl',
+        init_method='env://',
+        rank=hyp_param.local_rank,
+        world_size=hyp_param.gpus * 1
+    )
+    seed_it(local_rank + hyp_param.seed)
+    import model.visual.sam2  # noqa: F401 — registers Hydra `configs`
+    from hydra import compose
+    from omegaconf import OmegaConf
+    arch_h = compose(config_name='auralfuser/architecture.yaml')
+    OmegaConf.resolve(arch_h)
+    hyp_param.aural_fuser = OmegaConf.to_container(arch_h.aural_fuser, resolve=True)
+    train_cfg = compose(config_name='training/sam2_training_config.yaml')
+    OmegaConf.resolve(train_cfg)
+    hyp_param.contrastive_learning = OmegaConf.to_container(train_cfg.contrastive_learning, resolve=True)
+    from model.mymodel import AVmodel
+    av_model = AVmodel(hyp_param).cuda()
+    torch.cuda.set_device(hyp_param.local_rank)
+    ckpt_sd = torch.load(hyp_param.inference_ckpt, map_location="cpu")
+    if not isinstance(ckpt_sd, dict):
+        raise TypeError("Checkpoint must be a state_dict dictionary.")
+    # Same as v1s/v2: full-model ckpt vs train-only aural_fuser ckpt (e.g. keys vgg.*, f_blocks.*).
+    if any(k.startswith("v_model.") or k.startswith("aural_fuser.") for k in ckpt_sd.keys()):
+        av_model.load_state_dict(ckpt_sd, strict=True)
+    else:
+        av_model.aural_fuser.load_state_dict(ckpt_sd, strict=True)
+    av_model = torch.nn.parallel.distributed.DistributedDataParallel(av_model, device_ids=[hyp_param.local_rank],
+                                                                     find_unused_parameters=False)
+    av_model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(av_model)
+    av_model.eval()
+    from dataloader.dataset import AV
+    from dataloader.visual.visual_augmentation import Augmentation as VisualAugmentation
+    from dataloader.audio.audio_augmentation import Augmentation as AudioAugmentation
+    from torch.utils.data import DataLoader, Subset
+    from torch.utils.data.distributed import DistributedSampler
+    visual_augmentation = VisualAugmentation(hyp_param.image_mean, hyp_param.image_std,
+                                             hyp_param.image_size, hyp_param.image_size,
+                                             hyp_param.scale_list, ignore_index=hyp_param.ignore_index)
+    audio_augmentation = AudioAugmentation(mono=True)
+    dataset = AV(split='test', augmentation={"visual": visual_augmentation, "audio": audio_augmentation},
+                 param=hyp_param, root_path=hyp_param.data_root_path, data_name=hyp_param.inference_data_name)
+    max_batches = getattr(hyp_param, "inference_max_batches", 0) or 0
+    if max_batches > 0:
+        n_samples = min(max_batches * hyp_param.batch_size, len(dataset))
+        dataset = Subset(dataset, range(n_samples))
+    sampler = DistributedSampler(dataset, shuffle=False)
+    test_dataloader = DataLoader(dataset, batch_size=hyp_param.batch_size, sampler=sampler,
+                                 num_workers=hyp_param.num_workers)
+    from trainer.train import Trainer
+    from utils.foreground_iou import ForegroundIoU
+    from utils.foreground_fscore import ForegroundFScore
+    metrics = {
+        "foreground_iou": ForegroundIoU(),
+        "foreground_f-score": ForegroundFScore(hyp_param.local_rank),
+    }
+    trainer = Trainer(hyp_param, loss=None, tensorboard=_DummyTensorboard(), metrics=metrics)
+    # Same three modes as main.py validation: default first mask / iou_select / iou_occ_select
+    runs = [
+        ("", "default (logits[:,0])"),
+        ("iou_select", "iou_select"),
+        ("iou_occ_select", "iou_occ_select"),
+    ]
+    results = []
+    for process, label in runs:
+        fiou, ffscore = trainer.valid(epoch=0, dataloader=test_dataloader, model=av_model, process=process)
+        results.append((label, fiou, ffscore))
+        torch.cuda.empty_cache()
+    if hyp_param.local_rank <= 0:
+        print("\n========== inference (same three process flags as training valid) ==========")
+        for label, fiou, ffscore in results:
+            print("  {:32s}  f_iou={}  f_f-score={}".format(label, fiou, ffscore))
+        print("=======================================================\n")
+# Script entry point: parse CLI options, merge them over the project config,
+# resolve data/checkpoint paths relative to the repository, then spawn the workers.
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Inference: full test set + three process modes')
+    # NOTE(review): --nodes is parsed but not read anywhere in the visible code.
+    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help='multi-process training for DDP')
+    parser.add_argument('-g', '--gpus', default=1, type=int,
+                        help='number of gpus per node')
+    parser.add_argument('--batch_size', default=1, type=int,
+                        help='Batch size (match training if needed)')
+    parser.add_argument('--epochs', default=80, type=int,
+                        help="unused")
+    parser.add_argument('--lr', default=1e-5, type=float,
+                        help="unused")
+    parser.add_argument('--online', action="store_true",
+                        help='unused')
+    parser.add_argument(
+        '--inference_ckpt', type=str, default=None,
+        help='Trained AuralSAM2 checkpoint (.pth state_dict). '
+             'SAM2 backbone is loaded from backbone_weight in configs (same path as training: repo_root/ckpts/sam_ckpts/). '
+             'Default if unset: avs.code/training_details/.../hiera_l.pth',
+    )
+    parser.add_argument('--inference_data_name', type=str, default='v1m',
+                        help='AVSBench subset folder label (v1s|v1m|v2); must match training test split')
+    parser.add_argument('--inference_max_batches', type=int, default=0,
+                        help='0 = full test; >0 = first N batches only (debug)')
+    args = parser.parse_args()
+    # Imported here, after Path.mkdir has been patched, so a read-only saved_dir does not fail the import.
+    from configs.config import C
+    # Later mapping wins: CLI values override same-named config entries.
+    args = EasyDict({**C, **vars(args)})
+    _repo = pathlib.Path(__file__).resolve().parent
+    # Repo root: .../AuralSAM2 (parent of avs.code)
+    _workspace = _repo.parent.parent
+    args.data_root_path = str(_workspace / 'AVSBench')
+    args.backbone_weight = str(_workspace / 'ckpts' / 'sam_ckpts' / 'sam2_hiera_large.pt')
+    args.audio.PRETRAINED_VGGISH_MODEL_PATH = str(_workspace / 'ckpts' / 'vggish-10086976.pth')
+    # Scratch output directory for this inference run.
+    args.saved_dir = '/tmp/v1m_infer_ckpt'
+    pathlib.Path(args.saved_dir).mkdir(parents=True, exist_ok=True)
+    # Fall back to the checkpoint path under training_details when none is given.
+    if args.inference_ckpt is None:
+        args.inference_ckpt = str(
+            _repo.parent / 'training_details' / 'v1m' / 'hiera_l' / 'hiera_l.pth'
+        )
+    # Local rendezvous address/port consumed by init_method='env://' in main().
+    os.environ['MASTER_ADDR'] = '127.0.0.1'
+    os.environ['MASTER_PORT'] = '9901'
+    # One worker per GPU; spawn supplies the process index as main()'s first argument.
+    torch.multiprocessing.spawn(main, nprocs=args.gpus, args=(args.gpus, args))

avs.code/v1m.code/loss/training/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """Training loss modules."""
2	+

avs.code/v1m.code/loss/training/contrastive_learning.py ADDED Viewed

	@@ -0,0 +1,201 @@

+from abc import ABC
+import torch
+import torch.nn as nn
+class ContrastLoss(nn.Module, ABC):
+    def __init__(self, hyp_param):
+        super().__init__()
+        self.param = hyp_param
+        _defaults = {
+            "temperature": 0.10,
+            "ignore_idx": 255,
+            "ood_idx": 254,
+            "max_views": 512,
+            "proj_dim": 512,
+            "sample_limits": 128,
+            "total_limits": 15240,
+        }
+        _raw = getattr(hyp_param, "contrastive_learning", None) or {}
+        _cfg = {**_defaults, **_raw}
+        self.temperature = _cfg["temperature"]
+        self.ignore_idx = _cfg["ignore_idx"]
+        self.ood_idx = _cfg["ood_idx"]
+        self.max_views = _cfg["max_views"]
+        self.proj_dim = _cfg["proj_dim"]
+        self.sample_limits = _cfg["sample_limits"]
+        self.total_limits = _cfg["total_limits"]
+    def select_class_wise_samples(self, embeddings, audio_embeddings, predictions, masks, batch_idx):
+        embedding_sample_list = []
+        label_list = []
+        embedding_sample_list_a = []
+        label_list_a = []
+        class_index_list = torch.unique(masks)
+        if len(class_index_list) > 1:
+            for class_index in class_index_list[1:]:
+                embedding_sample_list_a.append(audio_embeddings.unsqueeze(0))
+                label_list_a.append(class_index.unsqueeze(0) + batch_idx * 1e3)
+        else:
+            embedding_sample_list_a.append(audio_embeddings.unsqueeze(0))
+            label_list_a.append(torch.zeros([1], device=embeddings.device) + batch_idx * 1e3)
+        sample_limits = self.sample_limits
+        embeddings = embeddings.permute(1, 0)
+        for class_index in class_index_list:
+            hard_indices = embeddings[((masks != predictions) & (masks == class_index)).nonzero()]
+            easy_indices = embeddings[((masks == predictions) & (masks == class_index)).nonzero()]
+            hard_indices_num, easy_indices_num = hard_indices.shape[0], easy_indices.shape[0]
+            selective_num_hard = min(sample_limits, hard_indices_num)
+            selective_num_easy = min(sample_limits, easy_indices_num)
+            if (selective_num_hard + selective_num_easy) < sample_limits * 2:
+                if selective_num_hard > selective_num_easy:
+                    selective_num_hard += sample_limits * 2 - selective_num_easy
+                else:
+                    selective_num_easy += sample_limits * 2 - selective_num_hard
+            hard_chosen_indices = torch.randperm(hard_indices_num)[:selective_num_hard]
+            embedding_sample_list.append(hard_indices[hard_chosen_indices])
+            label_list.append(masks[hard_chosen_indices] + batch_idx * 1e3)
+            easy_chosen_indices = torch.randperm(easy_indices_num)[:selective_num_easy]
+            embedding_sample_list.append(easy_indices[easy_chosen_indices])
+            label_list.append(masks[easy_chosen_indices] + batch_idx * 1e3)
+        return embedding_sample_list, label_list, embedding_sample_list_a, label_list_a
+    def forward_audio_visual(self, visual_embeddings, audio_embeddings, masks, predictions):
+        masks = masks.flatten(start_dim=1)
+        predictions = predictions.flatten(start_dim=1)
+        visual_embeddings = visual_embeddings.flatten(start_dim=-2)
+        visual_embedding_sample_list = []
+        visual_label_list = []
+        audio_embedding_sample_list = []
+        audio_label_list = []
+        for frame_idx in range(masks.shape[0]):
+            current_vision_feats = visual_embeddings[frame_idx]
+            current_masks = masks[frame_idx]
+            current_predictions = predictions[frame_idx]
+            current_audio_feats = audio_embeddings[frame_idx]
+            for layer_idx in range(3):
+                (
+                    selected_vision_embeddings,
+                    selected_vision_labels,
+                    selected_audio_embeddings,
+                    selected_audio_labels,
+                ) = self.select_class_wise_samples(
+                    current_vision_feats[layer_idx],
+                    current_audio_feats[layer_idx],
+                    current_predictions,
+                    current_masks,
+                    0,
+                )
+                visual_embedding_sample_list += selected_vision_embeddings
+                visual_label_list += selected_vision_labels
+                audio_embedding_sample_list += selected_audio_embeddings
+                audio_label_list += selected_audio_labels
+        if len(visual_embedding_sample_list) == 0:
+            return 0.0
+        visual_embedding_sample_list = torch.cat(visual_embedding_sample_list, dim=0).squeeze()
+        visual_label_list = torch.cat(visual_label_list, dim=0).unsqueeze(-1)
+        audio_embedding_sample_list = torch.cat(audio_embedding_sample_list, dim=0).squeeze()
+        audio_label_list = torch.cat(audio_label_list).unsqueeze(1)
+        total_limits = self.total_limits
+        if visual_embedding_sample_list.shape[0] > total_limits:
+            rand_index = torch.randperm(visual_embedding_sample_list.shape[0])[total_limits]
+            visual_embedding_sample_list = visual_embedding_sample_list[:rand_index]
+            visual_label_list = visual_label_list[:rand_index]
+        loss = self.info_nce(
+            visual_embedding_sample_list,
+            visual_label_list,
+            audio_embedding_sample_list,
+            audio_label_list,
+        )
+        return loss
+    def forward(self, embeddings, output_dicts, masks):
+        predictions = torch.cat([i["multistep_pred_masks"] for i in output_dicts])
+        predictions = torch.nn.functional.interpolate(
+            predictions,
+            size=(int(self.param.image_size / 16), int(self.param.image_size / 16)),
+            mode="bilinear",
+            align_corners=False,
+        ).squeeze()
+        masks = torch.nn.functional.interpolate(
+            masks.unsqueeze(1),
+            size=(int(self.param.image_size / 16), int(self.param.image_size / 16)),
+            mode="nearest",
+        ).squeeze()
+        visual_embeddings, audio_embeddings = embeddings
+        visual_embeddings = torch.cat(
+            [
+                torch.cat(
+                    [
+                        visual_embeddings[0][i].unsqueeze(0),
+                        visual_embeddings[1][i].unsqueeze(0),
+                        visual_embeddings[2][i].unsqueeze(0),
+                    ]
+                ).unsqueeze(0)
+                for i in range(masks.shape[0])
+            ]
+        )
+        audio_embeddings = torch.cat(
+            [
+                torch.cat(
+                    [
+                        audio_embeddings[0][i].unsqueeze(0),
+                        audio_embeddings[1][i].unsqueeze(0),
+                        audio_embeddings[2][i].unsqueeze(0),
+                    ]
+                ).unsqueeze(0)
+                for i in range(masks.shape[0])
+            ]
+        )
+        return self.forward_audio_visual(
+            visual_embeddings, audio_embeddings.squeeze(), masks, predictions
+        )
+    @staticmethod
+    def manipulate_cover_mask(a_label, current_mask):
+        a_label = a_label + 1
+        visual_mask = torch.matmul(a_label, torch.transpose(a_label, 0, 1))
+        current_mask[: visual_mask.shape[1], : visual_mask.shape[0]][visual_mask == 1.0] = 0
+        current_mask[: visual_mask.shape[1], : visual_mask.shape[0]][visual_mask == 4.0] = 0
+        return current_mask
+    def info_nce(self, anchors_, a_labels_, contras_, c_labels_):
+        c_labels_ = torch.cat([a_labels_, c_labels_])
+        contras_ = torch.cat([anchors_, contras_])
+        mask = torch.eq(a_labels_, torch.transpose(c_labels_, 0, 1)).float()
+        anchor_dot_contrast = torch.div(
+            torch.matmul(anchors_, torch.transpose(contras_, 0, 1)),
+            self.temperature,
+        )
+        logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
+        logits = anchor_dot_contrast - logits_max.detach()
+        neg_mask = 1 - mask
+        mask = self.manipulate_cover_mask(a_label=a_labels_, current_mask=mask)
+        mask = mask.fill_diagonal_(0.0)
+        neg_logits = torch.exp(logits) * neg_mask
+        neg_logits = neg_logits.sum(1, keepdim=True)
+        exp_logits = torch.exp(logits)
+        log_prob = logits - torch.log(exp_logits + neg_logits)
+        mask_pos_pairs = mask.sum(1)
+        mask_pos_pairs = torch.where(mask_pos_pairs < 1e-6, 1, mask_pos_pairs)
+        mean_log_prob_pos = (mask * log_prob).sum(1) / mask_pos_pairs
+        assert not torch.isnan(mean_log_prob_pos).any(), print(torch.isnan(log_prob).any())
+        return -mean_log_prob_pos.mean()

avs.code/v1m.code/loss/training/sam2_training_loss.py ADDED Viewed

	@@ -0,0 +1,220 @@

+from collections import defaultdict
+from typing import Dict, List
+import torch
+import torch.distributed
+import torch.nn as nn
+import torch.nn.functional as F
+CORE_LOSS_KEY = "core_loss"
+def dice_loss(inputs, targets, num_objects, loss_on_multimask=False):
+    inputs = inputs.sigmoid()
+    if loss_on_multimask:
+        assert inputs.dim() == 4 and targets.dim() == 4
+        inputs = inputs.flatten(2)
+        targets = targets.flatten(2)
+        numerator = 2 * (inputs * targets).sum(-1)
+    else:
+        inputs = inputs.flatten(1)
+        numerator = 2 * (inputs * targets).sum(1)
+    denominator = inputs.sum(-1) + targets.sum(-1)
+    loss = 1 - (numerator + 1) / (denominator + 1)
+    if loss_on_multimask:
+        return loss / num_objects
+    return loss.sum() / num_objects
+def sigmoid_focal_loss(
+    inputs,
+    targets,
+    num_objects,
+    alpha: float = 0.25,
+    gamma: float = 2,
+    loss_on_multimask=False,
+):
+    prob = inputs.sigmoid()
+    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+    if loss_on_multimask:
+        assert loss.dim() == 4
+        return loss.flatten(2).mean(-1) / num_objects
+    return loss.mean(1).sum() / num_objects
+def iou_loss(
+    inputs, targets, pred_ious, num_objects, loss_on_multimask=False, use_l1_loss=False
+):
+    assert inputs.dim() == 4 and targets.dim() == 4
+    pred_mask = inputs.flatten(2) > 0
+    gt_mask = targets.flatten(2) > 0
+    area_i = torch.sum(pred_mask & gt_mask, dim=-1).float()
+    area_u = torch.sum(pred_mask | gt_mask, dim=-1).float()
+    actual_ious = area_i / torch.clamp(area_u, min=1.0)
+    if use_l1_loss:
+        loss = F.l1_loss(pred_ious, actual_ious, reduction="none")
+    else:
+        loss = F.mse_loss(pred_ious, actual_ious, reduction="none")
+    if loss_on_multimask:
+        return loss / num_objects
+    return loss.sum() / num_objects
+class MultiStepMultiMasksAndIous(nn.Module):
+    def __init__(
+        self,
+        weight_dict,
+        focal_alpha=0.25,
+        focal_gamma=2,
+        supervise_all_iou=False,
+        iou_use_l1_loss=False,
+        pred_obj_scores=False,
+        focal_gamma_obj_score=0.0,
+        focal_alpha_obj_score=-1,
+        gpu_num=1,
+    ):
+        super().__init__()
+        self.weight_dict = weight_dict
+        self.focal_alpha = focal_alpha
+        self.focal_gamma = focal_gamma
+        self.world_size = gpu_num
+        assert "loss_mask" in self.weight_dict
+        assert "loss_dice" in self.weight_dict
+        assert "loss_iou" in self.weight_dict
+        if "loss_class" not in self.weight_dict:
+            self.weight_dict["loss_class"] = 0.0
+        self.focal_alpha_obj_score = focal_alpha_obj_score
+        self.focal_gamma_obj_score = focal_gamma_obj_score
+        self.supervise_all_iou = supervise_all_iou
+        self.iou_use_l1_loss = iou_use_l1_loss
+        self.pred_obj_scores = pred_obj_scores
+    def forward(self, outs_batch: List[Dict], targets_batch: torch.Tensor):
+        assert len(outs_batch) == len(targets_batch)
+        num_objects = torch.tensor(
+            targets_batch.shape[1], device=targets_batch.device, dtype=torch.float
+        )
+        torch.distributed.all_reduce(num_objects)
+        num_objects = torch.clamp(num_objects / self.world_size, min=1).item()
+        losses = defaultdict(int)
+        for outs, targets in zip(outs_batch, targets_batch):
+            cur_losses = self._forward(outs, targets, num_objects)
+            for k, v in cur_losses.items():
+                losses[k] += v
+        return losses
+    def _forward(self, outputs: Dict, targets: torch.Tensor, num_objects):
+        target_masks = targets.unsqueeze(1).float()
+        assert target_masks.dim() == 4
+        src_masks_list = outputs["multistep_pred_multimasks_high_res"]
+        ious_list = outputs["multistep_pred_ious"]
+        object_score_logits_list = outputs["multistep_object_score_logits"]
+        assert len(src_masks_list) == len(ious_list)
+        assert len(object_score_logits_list) == len(ious_list)
+        losses = {"loss_mask": 0, "loss_dice": 0, "loss_iou": 0, "loss_class": 0}
+        for src_masks, ious, object_score_logits in zip(
+            src_masks_list, ious_list, object_score_logits_list
+        ):
+            self._update_losses(
+                losses, src_masks, target_masks, ious, num_objects, object_score_logits
+            )
+        losses[CORE_LOSS_KEY] = self.reduce_loss(losses)
+        return losses
+    def _update_losses(
+        self, losses, src_masks, target_masks, ious, num_objects, object_score_logits
+    ):
+        target_masks = target_masks.expand_as(src_masks)
+        loss_multimask = sigmoid_focal_loss(
+            src_masks,
+            target_masks,
+            num_objects,
+            alpha=self.focal_alpha,
+            gamma=self.focal_gamma,
+            loss_on_multimask=True,
+        )
+        loss_multidice = dice_loss(
+            src_masks, target_masks, num_objects, loss_on_multimask=True
+        )
+        if not self.pred_obj_scores:
+            loss_class = torch.tensor(
+                0.0, dtype=loss_multimask.dtype, device=loss_multimask.device
+            )
+            target_obj = torch.ones(
+                loss_multimask.shape[0],
+                1,
+                dtype=loss_multimask.dtype,
+                device=loss_multimask.device,
+            )
+        else:
+            target_obj = torch.any((target_masks[:, 0] > 0).flatten(1), dim=-1)[
+                ..., None
+            ].float()
+            loss_class = sigmoid_focal_loss(
+                object_score_logits,
+                target_obj,
+                num_objects,
+                alpha=self.focal_alpha_obj_score,
+                gamma=self.focal_gamma_obj_score,
+            )
+        loss_multiiou = iou_loss(
+            src_masks,
+            target_masks,
+            ious,
+            num_objects,
+            loss_on_multimask=True,
+            use_l1_loss=self.iou_use_l1_loss,
+        )
+        assert loss_multimask.dim() == 2
+        assert loss_multidice.dim() == 2
+        assert loss_multiiou.dim() == 2
+        if loss_multimask.size(1) > 1:
+            loss_combo = (
+                loss_multimask * self.weight_dict["loss_mask"]
+                + loss_multidice * self.weight_dict["loss_dice"]
+            )
+            best_loss_inds = torch.argmin(loss_combo, dim=-1)
+            batch_inds = torch.arange(loss_combo.size(0), device=loss_combo.device)
+            loss_mask = loss_multimask[batch_inds, best_loss_inds].unsqueeze(1)
+            loss_dice = loss_multidice[batch_inds, best_loss_inds].unsqueeze(1)
+            if self.supervise_all_iou:
+                loss_iou = loss_multiiou.mean(dim=-1).unsqueeze(1)
+            else:
+                loss_iou = loss_multiiou[batch_inds, best_loss_inds].unsqueeze(1)
+        else:
+            loss_mask = loss_multimask
+            loss_dice = loss_multidice
+            loss_iou = loss_multiiou
+        loss_mask = loss_mask * target_obj
+        loss_dice = loss_dice * target_obj
+        loss_iou = loss_iou * target_obj
+        losses["loss_mask"] += loss_mask.sum()
+        losses["loss_dice"] += loss_dice.sum()
+        losses["loss_iou"] += loss_iou.sum()
+        losses["loss_class"] += loss_class
+    def reduce_loss(self, losses):
+        reduced_loss = 0.0
+        for loss_key, weight in self.weight_dict.items():
+            if loss_key not in losses:
+                raise ValueError(f"{type(self)} doesn't compute {loss_key}")
+            if weight != 0:
+                reduced_loss += losses[loss_key] * weight
+        return reduced_loss

avs.code/v1m.code/main.py ADDED Viewed

	@@ -0,0 +1,166 @@

+"""DDP training entry: AV model with SAM2 frozen, AuralFuser trainable, Hydra transforms and loss."""
+import os
+import torch
+import numpy
+import random
+import argparse
+from easydict import EasyDict
+def seed_it(seed):
+    """Fix RNGs and cuDNN for reproducible runs (rank offsets seed in DDP)."""
+    os.environ["PYTHONSEED"] = str(seed)
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.enabled = True
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+def main(local_rank, ngpus_per_node, hyp_param):
+    """Per-process training entry point: set up DDP, data, model and loss, then train/validate.
+    Args:
+        local_rank: index of this process; also used as the distributed rank and CUDA device.
+        ngpus_per_node: number of processes spawned on this node (not read in this function).
+        hyp_param: attribute-style configuration object; it is mutated here (local_rank,
+            image sizes, contrastive_learning and aural_fuser sections).
+    """
+    hyp_param.local_rank = local_rank
+    # NCCL process group; world size = GPUs on this node
+    torch.distributed.init_process_group(
+        backend='nccl',
+        init_method='env://',
+        rank=hyp_param.local_rank,
+        world_size=hyp_param.gpus * 1
+    )
+    # Each rank gets a different seed (base seed offset by its rank).
+    seed_it(local_rank + hyp_param.seed)
+    torch.cuda.set_device(hyp_param.local_rank)
+    import model.visual.sam2  # noqa: F401 — registers Hydra `configs` (initialize_config_module)
+    from hydra import compose
+    from hydra.utils import instantiate
+    from omegaconf import OmegaConf
+    # Hydra configs under v1m.code/configs (same pattern as training/sam2_training_config.yaml)
+    transform_config_path = 'training/sam2_training_config.yaml'
+    # The hiera_t configuration runs at 224x224 input resolution.
+    if 'hiera_t' in hyp_param.sam_config_path:
+        hyp_param.image_size = 224
+        hyp_param.image_embedding_size = int(hyp_param.image_size / 16)
+        print('\n upload image size to be {}x{} \n'.format(224, 224), flush=True)
+    cfg = compose(config_name=transform_config_path)
+    OmegaConf.resolve(cfg)
+    hyp_param.contrastive_learning = OmegaConf.to_container(cfg.contrastive_learning, resolve=True)
+    arch_h = compose(config_name='auralfuser/architecture.yaml')
+    OmegaConf.resolve(arch_h)
+    hyp_param.aural_fuser = OmegaConf.to_container(arch_h.aural_fuser, resolve=True)
+    from model.mymodel import AVmodel
+    av_model = AVmodel(hyp_param).cuda(hyp_param.local_rank)
+    av_model = torch.nn.parallel.distributed.DistributedDataParallel(av_model, device_ids=[hyp_param.local_rank],
+                                                                     find_unused_parameters=True)
+    # Optimizer: parameter groups from AuralFuser only (train_* vs VGG backbone)
+    from utils.utils import manipulate_params
+    parameter_list = manipulate_params(hyp_param, av_model.module.aural_fuser)
+    optimiser = torch.optim.AdamW(parameter_list, betas=(0.9, 0.999))
+    from dataloader.dataset import AV
+    from dataloader.visual.visual_augmentation import Augmentation as VisualAugmentation
+    from dataloader.audio.audio_augmentation import Augmentation as AudioAugmentation
+    from torch.utils.data.distributed import DistributedSampler
+    # Training uses the Hydra-instantiated transform pipeline (first entry of train_transforms).
+    compose_api = instantiate(cfg.train_transforms, _recursive_=True)[0]
+    audio_augmentation = AudioAugmentation(mono=True)
+    train_dataset = AV(split='train', augmentation={"visual": compose_api, "audio": audio_augmentation},
+                       param=hyp_param, root_path=hyp_param.data_root_path, data_name=hyp_param.data_name)
+    # The test split uses the project's own visual augmentation class instead.
+    visual_augmentation = VisualAugmentation(hyp_param.image_mean, hyp_param.image_std,
+                                             hyp_param.image_size, hyp_param.image_size,
+                                             hyp_param.scale_list, ignore_index=hyp_param.ignore_index)
+    audio_augmentation = AudioAugmentation(mono=True)
+    random_sampler = DistributedSampler(train_dataset, shuffle=True)
+    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=hyp_param.batch_size,
+                                                   sampler=random_sampler,
+                                                   num_workers=hyp_param.num_workers, drop_last=True)
+    test_dataset = AV(split='test', augmentation={"visual": visual_augmentation, "audio": audio_augmentation},
+                      param=hyp_param, root_path=hyp_param.data_root_path, data_name=hyp_param.data_name)
+    order_sampler = DistributedSampler(test_dataset, shuffle=False)
+    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, sampler=order_sampler,
+                                                  num_workers=hyp_param.num_workers)
+    criterion = instantiate(cfg.loss, _recursive_=True)['all']
+    from utils.tensorboard import Tensorboard
+    # Only the process with local_rank <= 0 owns a Tensorboard object.
+    tensorboard = Tensorboard(config=hyp_param) if hyp_param.local_rank <= 0 else None
+    from trainer.train import Trainer
+    from utils.foreground_iou import ForegroundIoU
+    from utils.foreground_fscore import ForegroundFScore
+    metrics = {"foreground_iou": ForegroundIoU(), "foreground_f-score": ForegroundFScore(0 if hyp_param.local_rank <= 0 else hyp_param.local_rank)}
+    trainer = Trainer(hyp_param, loss=criterion, tensorboard=tensorboard, metrics=metrics)
+    curr_best = 0.  # checkpoint when IoU (iou_select mode) improves
+    for epoch in range(hyp_param.epochs):
+        av_model.train()
+        # Called after train() on every epoch -- presumably train() would otherwise
+        # re-enable training mode on the frozen SAM modules; confirm in AVmodel.
+        av_model.module.freeze_sam_parameters()
+        # Gives the distributed sampler a new shuffle order for this epoch.
+        random_sampler.set_epoch(epoch)
+        trainer.train(epoch=epoch, dataloader=train_dataloader, model=av_model, optimiser=optimiser)
+        torch.distributed.barrier()
+        torch.cuda.empty_cache()
+        av_model.eval()
+        # Three validation modes: default first mask / IoU-selected mask / IoU + objectness gate
+        curr_results1, _ = trainer.valid(epoch=epoch, dataloader=test_dataloader, model=av_model, process='first_index')
+        curr_results, _ = trainer.valid(epoch=epoch, dataloader=test_dataloader, model=av_model, process='iou_select')
+        curr_results3, _ = trainer.valid(epoch=epoch, dataloader=test_dataloader, model=av_model, process='iou_occ_select')
+        # Only the 'iou_select' score drives checkpointing; only AuralFuser weights are saved.
+        if hyp_param.local_rank <= 0 and curr_results > curr_best:
+            curr_best = curr_results
+            torch.save(av_model.module.aural_fuser.state_dict(), os.path.join(hyp_param.saved_dir, str(curr_results) + ".pth"))
+        torch.distributed.barrier()
+        torch.cuda.empty_cache()
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='PyTorch Training')
+    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help='multi-process training for DDP')
+    parser.add_argument('-g', '--gpus', default=1, type=int,
+                        help='number of gpus per node')
+    parser.add_argument('--batch_size', default=1, type=int)
+    parser.add_argument('--epochs', default=80, type=int,
+                        help="total epochs that used for the training")
+    parser.add_argument('--lr', default=1e-4, type=float,
+                        help='Default HEAD Learning rate is same as others, '
+                             '*Note: in ddp training, lr will automatically times by n_gpu')
+    parser.add_argument('--online', action="store_true",
+                        help='switch on for visualization; switch off for debug')
+    args = parser.parse_args()
+    from configs.config import C
+    args = EasyDict({**C, **vars(args)})
+    os.environ['MASTER_ADDR'] = '127.0.0.1'
+    os.environ['MASTER_PORT'] = '9902'
+    torch.multiprocessing.spawn(main, nprocs=args.gpus, args=(args.gpus, args))

avs.code/v1m.code/model/audio/torchvggish/mel_features.py ADDED Viewed

	@@ -0,0 +1,223 @@

+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Defines routines to compute mel spectrogram features from audio waveform."""
+import numpy as np
+def frame(data, window_length, hop_length):
+  """Convert array into a sequence of successive possibly overlapping frames.
+  An n-dimensional array of shape (num_samples, ...) is converted into an
+  (n+1)-D array of shape (num_frames, window_length, ...), where each frame
+  starts hop_length points after the preceding one.
+  This is accomplished using stride_tricks, so the original data is not
+  copied.  However, there is no zero-padding, so any incomplete frames at the
+  end are not included.
+  Args:
+    data: np.array of dimension N >= 1.
+    window_length: Number of samples in each frame.
+    hop_length: Advance (in samples) between each window.
+  Returns:
+    (N+1)-D np.array with as many rows as there are complete frames that can be
+    extracted.
+  """
+  num_samples = data.shape[0]
+  num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length))
+  shape = (num_frames, window_length) + data.shape[1:]
+  strides = (data.strides[0] * hop_length,) + data.strides
+  return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides)
+def periodic_hann(window_length):
+  """Calculate a "periodic" Hann window.
+  The classic Hann window is defined as a raised cosine that starts and
+  ends on zero, and where every value appears twice, except the middle
+  point for an odd-length window.  Matlab calls this a "symmetric" window
+  and np.hanning() returns it.  However, for Fourier analysis, this
+  actually represents just over one cycle of a period N-1 cosine, and
+  thus is not compactly expressed on a length-N Fourier basis.  Instead,
+  it's better to use a raised cosine that ends just before the final
+  zero value - i.e. a complete cycle of a period-N cosine.  Matlab
+  calls this a "periodic" window. This routine calculates it.
+  Args:
+    window_length: The number of points in the returned window.
+  Returns:
+    A 1D np.array containing the periodic hann window.
+  """
+  return 0.5 - (0.5 * np.cos(2 * np.pi / window_length *
+                             np.arange(window_length)))
+def stft_magnitude(signal, fft_length,
+                   hop_length=None,
+                   window_length=None):
+  """Calculate the short-time Fourier transform magnitude.
+  Args:
+    signal: 1D np.array of the input time-domain signal.
+    fft_length: Size of the FFT to apply.
+    hop_length: Advance (in samples) between each frame passed to FFT.
+    window_length: Length of each block of samples to pass to FFT.
+  Returns:
+    2D np.array where each row contains the magnitudes of the fft_length/2+1
+    unique values of the FFT for the corresponding frame of input samples.
+  """
+  frames = frame(signal, window_length, hop_length)
+  # Apply frame window to each frame. We use a periodic Hann (cosine of period
+  # window_length) instead of the symmetric Hann of np.hanning (period
+  # window_length-1).
+  window = periodic_hann(window_length)
+  windowed_frames = frames * window
+  return np.abs(np.fft.rfft(windowed_frames, int(fft_length)))
+# Mel spectrum constants and functions.
+_MEL_BREAK_FREQUENCY_HERTZ = 700.0
+_MEL_HIGH_FREQUENCY_Q = 1127.0
+def hertz_to_mel(frequencies_hertz):
+  """Convert frequencies to mel scale using HTK formula.
+  Args:
+    frequencies_hertz: Scalar or np.array of frequencies in hertz.
+  Returns:
+    Object of same size as frequencies_hertz containing corresponding values
+    on the mel scale.
+  """
+  return _MEL_HIGH_FREQUENCY_Q * np.log(
+      1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ))
+def spectrogram_to_mel_matrix(num_mel_bins=20,
+                              num_spectrogram_bins=129,
+                              audio_sample_rate=8000,
+                              lower_edge_hertz=125.0,
+                              upper_edge_hertz=3800.0):
+  """Return a matrix that can post-multiply spectrogram rows to make mel.
+  Returns a np.array matrix A that can be used to post-multiply a matrix S of
+  spectrogram values (STFT magnitudes) arranged as frames x bins to generate a
+  "mel spectrogram" M of frames x num_mel_bins.  M = S A.
+  The classic HTK algorithm exploits the complementarity of adjacent mel bands
+  to multiply each FFT bin by only one mel weight, then add it, with positive
+  and negative signs, to the two adjacent mel bands to which that bin
+  contributes.  Here, by expressing this operation as a matrix multiply, we go
+  from num_fft multiplies per frame (plus around 2*num_fft adds) to around
+  num_fft^2 multiplies and adds.  However, because these are all presumably
+  accomplished in a single call to np.dot(), it's not clear which approach is
+  faster in Python.  The matrix multiplication has the attraction of being more
+  general and flexible, and much easier to read.
+  Args:
+    num_mel_bins: How many bands in the resulting mel spectrum.  This is
+      the number of columns in the output matrix.
+    num_spectrogram_bins: How many bins there are in the source spectrogram
+      data, which is understood to be fft_size/2 + 1, i.e. the spectrogram
+      only contains the nonredundant FFT bins.
+    audio_sample_rate: Samples per second of the audio at the input to the
+      spectrogram. We need this to figure out the actual frequencies for
+      each spectrogram bin, which dictates how they are mapped into mel.
+    lower_edge_hertz: Lower bound on the frequencies to be included in the mel
+      spectrum.  This corresponds to the lower edge of the lowest triangular
+      band.
+    upper_edge_hertz: The desired top edge of the highest frequency band.
+  Returns:
+    An np.array with shape (num_spectrogram_bins, num_mel_bins).
+  Raises:
+    ValueError: if frequency edges are incorrectly ordered or out of range.
+  """
+  nyquist_hertz = audio_sample_rate / 2.
+  if lower_edge_hertz < 0.0:
+    raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz)
+  if lower_edge_hertz >= upper_edge_hertz:
+    raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" %
+                     (lower_edge_hertz, upper_edge_hertz))
+  if upper_edge_hertz > nyquist_hertz:
+    raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" %
+                     (upper_edge_hertz, nyquist_hertz))
+  spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins)
+  spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz)
+  # The i'th mel band (starting from i=1) has center frequency
+  # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge
+  # band_edges_mel[i+1].  Thus, we need num_mel_bins + 2 values in
+  # the band_edges_mel arrays.
+  band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz),
+                               hertz_to_mel(upper_edge_hertz), num_mel_bins + 2)
+  # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins
+  # of spectrogram values.
+  mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins))
+  for i in range(num_mel_bins):
+    lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3]
+    # Calculate lower and upper slopes for every spectrogram bin.
+    # Line segments are linear in the *mel* domain, not hertz.
+    lower_slope = ((spectrogram_bins_mel - lower_edge_mel) /
+                   (center_mel - lower_edge_mel))
+    upper_slope = ((upper_edge_mel - spectrogram_bins_mel) /
+                   (upper_edge_mel - center_mel))
+    # .. then intersect them with each other and zero.
+    mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope,
+                                                          upper_slope))
+  # HTK excludes the spectrogram DC bin; make sure it always gets a zero
+  # coefficient.
+  mel_weights_matrix[0, :] = 0.0
+  return mel_weights_matrix
+def log_mel_spectrogram(data,
+                        audio_sample_rate=8000,
+                        log_offset=0.0,
+                        window_length_secs=0.025,
+                        hop_length_secs=0.010,
+                        **kwargs):
+  """Convert waveform to a log magnitude mel-frequency spectrogram.
+  Args:
+    data: 1D np.array of waveform data.
+    audio_sample_rate: The sampling rate of data.
+    log_offset: Add this to values when taking log to avoid -Infs.
+    window_length_secs: Duration of each window to analyze.
+    hop_length_secs: Advance between successive analysis windows.
+    **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix.
+  Returns:
+    2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank
+    magnitudes for successive frames.
+  """
+  window_length_samples = int(round(audio_sample_rate * window_length_secs))
+  hop_length_samples = int(round(audio_sample_rate * hop_length_secs))
+  fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0)))
+  spectrogram = stft_magnitude(
+      data,
+      fft_length=fft_length,
+      hop_length=hop_length_samples,
+      window_length=window_length_samples)
+  mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix(
+      num_spectrogram_bins=spectrogram.shape[1],
+      audio_sample_rate=audio_sample_rate, **kwargs))
+  return np.log(mel_spectrogram + log_offset)

avs.code/v1m.code/model/audio/torchvggish/vggish.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import numpy as np
+import torch
+import torch.nn as nn
+from torch import hub
+from . import vggish_input, vggish_params
+class VGG(nn.Module):
+    def __init__(self, features):
+        super(VGG, self).__init__()
+        self.features = features
+        self.embeddings = nn.Sequential(
+            nn.Linear(512 * 4 * 6, 4096),
+            nn.ReLU(True),
+            nn.Linear(4096, 4096),
+            nn.ReLU(True),
+            nn.Linear(4096, 128),
+            nn.ReLU(True))
+    def forward(self, x):
+        x = self.features(x)
+        # Transpose the output from features to
+        # remain compatible with vggish embeddings
+        x = torch.transpose(x, 1, 3)
+        x = torch.transpose(x, 1, 2)
+        x = x.contiguous()
+        x = x.view(x.size(0), -1)
+        return self.embeddings(x)
+class Postprocessor(nn.Module):
+    """Post-processes VGGish embeddings. Returns a torch.Tensor instead of a
+    numpy array in order to preserve the gradient.
+    "The initial release of AudioSet included 128-D VGGish embeddings for each
+    segment of AudioSet. These released embeddings were produced by applying
+    a PCA transformation (technically, a whitening transform is included as well)
+    and 8-bit quantization to the raw embedding output from VGGish, in order to
+    stay compatible with the YouTube-8M project which provides visual embeddings
+    in the same format for a large set of YouTube videos. This class implements
+    the same PCA (with whitening) and quantization transformations."
+    """
+    def __init__(self):
+        """Constructs a postprocessor."""
+        super(Postprocessor, self).__init__()
+        # Create empty matrix, for user's state_dict to load
+        self.pca_eigen_vectors = torch.empty(
+            (vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE,),
+            dtype=torch.float,
+        )
+        self.pca_means = torch.empty(
+            (vggish_params.EMBEDDING_SIZE, 1), dtype=torch.float
+        )
+        self.pca_eigen_vectors = nn.Parameter(self.pca_eigen_vectors, requires_grad=False)
+        self.pca_means = nn.Parameter(self.pca_means, requires_grad=False)
+    def postprocess(self, embeddings_batch):
+        """Applies tensor postprocessing to a batch of embeddings.
+        Args:
+          embeddings_batch: An tensor of shape [batch_size, embedding_size]
+            containing output from the embedding layer of VGGish.
+        Returns:
+          A tensor of the same shape as the input, containing the PCA-transformed,
+          quantized, and clipped version of the input.
+        """
+        assert len(embeddings_batch.shape) == 2, "Expected 2-d batch, got %r" % (
+            embeddings_batch.shape,
+        )
+        assert (
+                embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE
+        ), "Bad batch shape: %r" % (embeddings_batch.shape,)
+        # Apply PCA.
+        # - Embeddings come in as [batch_size, embedding_size].
+        # - Transpose to [embedding_size, batch_size].
+        # - Subtract pca_means column vector from each column.
+        # - Premultiply by PCA matrix of shape [output_dims, input_dims]
+        #   where both are are equal to embedding_size in our case.
+        # - Transpose result back to [batch_size, embedding_size].
+        pca_applied = torch.mm(self.pca_eigen_vectors, (embeddings_batch.t() - self.pca_means)).t()
+        # Quantize by:
+        # - clipping to [min, max] range
+        clipped_embeddings = torch.clamp(
+            pca_applied, vggish_params.QUANTIZE_MIN_VAL, vggish_params.QUANTIZE_MAX_VAL
+        )
+        # - convert to 8-bit in range [0.0, 255.0]
+        quantized_embeddings = torch.round(
+            (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL)
+            * (
+                    255.0
+                    / (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)
+            )
+        )
+        return torch.squeeze(quantized_embeddings)
+    def forward(self, x):
+        return self.postprocess(x)
+def make_layers():
+    layers = []
+    in_channels = 1
+    for v in [64, "M", 128, "M", 256, 256, "M", 512, 512, "M"]:
+        if v == "M":
+            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
+        else:
+            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
+            layers += [conv2d, nn.ReLU(inplace=True)]
+            in_channels = v
+    return nn.Sequential(*layers)
+def _vgg():
+    return VGG(make_layers())
+# def _spectrogram():
+#     config = dict(
+#         sr=16000,
+#         n_fft=400,
+#         n_mels=64,
+#         hop_length=160,
+#         window="hann",
+#         center=False,
+#         pad_mode="reflect",
+#         htk=True,
+#         fmin=125,
+#         fmax=7500,
+#         output_format='Magnitude',
+#         #             device=device,
+#     )
+#     return Spectrogram.MelSpectrogram(**config)
+class VGGish(VGG):
+    def __init__(self, cfg, device=None):
+        super().__init__(make_layers())
+        if cfg.FREEZE_AUDIO_EXTRACTOR:
+            state_dict = torch.load(cfg.PRETRAINED_VGGISH_MODEL_PATH)
+            super().load_state_dict(state_dict)
+            print(f'==> Load pretrained VGGish parameters from {cfg.PRETRAINED_VGGISH_MODEL_PATH}')
+        if device is None:
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            print("device: ", device)
+        self.device = device
+        self.preprocess = cfg.PREPROCESS_AUDIO_TO_LOG_MEL
+        self.postprocess = cfg.POSTPROCESS_LOG_MEL_WITH_PCA
+        if self.postprocess:
+            self.pproc = Postprocessor()
+            if cfg.FREEZE_AUDIO_EXTRACTOR:
+                state_dict = torch.load(cfg.PRETRAINED_PCA_PARAMS_PATH)
+                # TODO: Convert the state_dict to torch
+                state_dict[vggish_params.PCA_EIGEN_VECTORS_NAME] = torch.as_tensor(
+                    state_dict[vggish_params.PCA_EIGEN_VECTORS_NAME], dtype=torch.float
+                )
+                state_dict[vggish_params.PCA_MEANS_NAME] = torch.as_tensor(
+                    state_dict[vggish_params.PCA_MEANS_NAME].reshape(-1, 1), dtype=torch.float
+                )
+                self.pproc.load_state_dict(state_dict)
+        self.to(self.device)
+    def forward(self, x):
+        if self.preprocess:
+            print(">>> pre processing...")
+            x = self._preprocess(x)
+            x = x.to(self.device)
+        x = VGG.forward(self, x)
+        if self.postprocess:
+            print(">>> post processing...")
+            x = self._postprocess(x)
+        return x
+    def _preprocess(self, x):
+        # if isinstance(x, np.ndarray):
+        #     x = vggish_input.waveform_to_examples(x, fs)
+        if isinstance(x, str):
+            x = vggish_input.wavfile_to_examples(x)
+        else:
+            raise AttributeError
+        return x
+    def _postprocess(self, x):
+        return self.pproc(x)

avs.code/v1m.code/model/audio/torchvggish/vggish_input.py ADDED Viewed

	@@ -0,0 +1,98 @@

+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Compute input examples for VGGish from audio waveform."""
+# Modification: Return torch tensors rather than numpy arrays
+import torch
+import numpy as np
+import resampy
+from . import mel_features
+from . import vggish_params
+import soundfile as sf
+def waveform_to_examples(data, sample_rate, return_tensor=True):
+    """Converts audio waveform into an array of examples for VGGish.
+  Args:
+    data: np.array of either one dimension (mono) or two dimensions
+      (multi-channel, with the outer dimension representing channels).
+      Each sample is generally expected to lie in the range [-1.0, +1.0],
+      although this is not required.
+    sample_rate: Sample rate of data.
+    return_tensor: Return data as a Pytorch tensor ready for VGGish
+  Returns:
+    3-D np.array of shape [num_examples, num_frames, num_bands] which represents
+    a sequence of examples, each of which contains a patch of log mel
+    spectrogram, covering num_frames frames of audio and num_bands mel frequency
+    bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.
+  """
+    # Convert to mono.
+    if len(data.shape) > 1:
+        data = np.mean(data, axis=1)
+    # Resample to the rate assumed by VGGish.
+    if sample_rate != vggish_params.SAMPLE_RATE:
+        data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)
+    # Compute log mel spectrogram features.
+    log_mel = mel_features.log_mel_spectrogram(
+        data,
+        audio_sample_rate=vggish_params.SAMPLE_RATE,
+        log_offset=vggish_params.LOG_OFFSET,
+        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
+        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
+        num_mel_bins=vggish_params.NUM_MEL_BINS,
+        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
+        upper_edge_hertz=vggish_params.MEL_MAX_HZ)
+    # Frame features into examples.
+    features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
+    example_window_length = int(round(
+        vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
+    example_hop_length = int(round(
+        vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
+    log_mel_examples = mel_features.frame(
+        log_mel,
+        window_length=example_window_length,
+        hop_length=example_hop_length)
+    if return_tensor:
+        log_mel_examples = torch.tensor(
+            log_mel_examples, requires_grad=True)[:, None, :, :].float()
+    return log_mel_examples
+def wavfile_to_examples(wav_file, return_tensor=True):
+    """Convenience wrapper around waveform_to_examples() for a common WAV format.
+  Args:
+    wav_file: String path to a file, or a file-like object. The file
+    is assumed to contain WAV audio data with signed 16-bit PCM samples.
+    torch: Return data as a Pytorch tensor ready for VGGish
+  Returns:
+    See waveform_to_examples.
+  """
+    wav_data, sr = sf.read(wav_file, dtype='int16')
+    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
+    samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]
+    return waveform_to_examples(samples, sr, return_tensor)

avs.code/v1m.code/model/audio/torchvggish/vggish_params.py ADDED Viewed

	@@ -0,0 +1,53 @@

+# Copyright 2017 The TensorFlow Authors All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Global parameters for the VGGish model.
+See vggish_slim.py for more information.
+"""
+# Architectural constants.
+NUM_FRAMES = 96  # Frames in input mel-spectrogram patch.
+NUM_BANDS = 64  # Frequency bands in input mel-spectrogram patch.
+EMBEDDING_SIZE = 128  # Size of embedding layer.
+# Hyperparameters used in feature and example generation.
+SAMPLE_RATE = 16000  # Rate that input audio is resampled to before feature extraction.
+STFT_WINDOW_LENGTH_SECONDS = 0.025
+STFT_HOP_LENGTH_SECONDS = 0.010
+NUM_MEL_BINS = NUM_BANDS
+MEL_MIN_HZ = 125
+MEL_MAX_HZ = 7500
+LOG_OFFSET = 0.01  # Offset used for stabilized log of input mel-spectrogram.
+EXAMPLE_WINDOW_SECONDS = 0.96  # Each example contains 96 10ms frames
+EXAMPLE_HOP_SECONDS = 0.96  # with zero overlap.
+# Parameters used for embedding postprocessing.
+# Keys under which the PCA matrix and the PCA means are stored in the
+# postprocessor's parameter dictionary.
+PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors'
+PCA_MEANS_NAME = 'pca_means'
+# PCA-transformed embeddings are clipped to [QUANTIZE_MIN_VAL, QUANTIZE_MAX_VAL]
+# before being rescaled to the 8-bit range.
+QUANTIZE_MIN_VAL = -2.0
+QUANTIZE_MAX_VAL = +2.0
+# Hyperparameters used in training.
+INIT_STDDEV = 0.01  # Standard deviation used to initialize weights.
+LEARNING_RATE = 1e-4  # Learning rate for the Adam optimizer.
+ADAM_EPSILON = 1e-8  # Epsilon for the Adam optimizer.
+# Names of ops, tensors, and features.
+INPUT_OP_NAME = 'vggish/input_features'
+INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0'
+OUTPUT_OP_NAME = 'vggish/embedding'
+OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0'
+AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding'

avs.code/v1m.code/model/aural_fuser.py ADDED Viewed

	@@ -0,0 +1,567 @@

+import math
+import torch
+import torch.nn as nn
+from model.audio.torchvggish import vggish
+from timm.models.layers import DropPath, trunc_normal_
+from model.visual.sam2.modeling.position_encoding import PositionEmbeddingSine
+class ProjectionHead(nn.Module):
+    def __init__(self, dim_in, proj_dim=256, norm_act=nn.BatchNorm2d, conv_layer=nn.Conv2d):
+        super().__init__()
+        self.proj = nn.Sequential(
+            conv_layer(dim_in, proj_dim, kernel_size=1),
+            norm_act(proj_dim),
+            conv_layer(proj_dim, proj_dim, kernel_size=1),
+        )
+    def forward(self, x):
+        return torch.nn.functional.normalize(self.proj(x), p=2, dim=1)
+class AuralFuser(torch.nn.Module):
+    """Fuses VGGish audio with SAM2 FPN maps via patch embeds, fusion blocks, and projection heads."""
+    def __init__(self, hyp_param):
+        self.hyp_param = hyp_param
+        super().__init__()
+        self.vgg = vggish.VGGish(self.hyp_param.audio)
+        if not getattr(self.hyp_param, "train_vggish", False):
+            for p in self.vgg.parameters():
+                p.requires_grad = False
+        self.position_encoding_func = PositionEmbeddingSine(num_pos_feats=256, normalize=True, scale=None,
+                                                            temperature=10000)
+        # Populated in main.py / inference.py via Hydra compose('auralfuser/architecture.yaml') → hyp_param.aural_fuser
+        if not hasattr(self.hyp_param, "aural_fuser") or self.hyp_param.aural_fuser is None:
+            raise ValueError(
+                "hyp_param.aural_fuser is missing; load it with Hydra compose before constructing AuralFuser."
+            )
+        arch_cfg = self.hyp_param.aural_fuser
+        _patch_cfgs = [tuple(i) for i in arch_cfg["patch_cfgs"]]
+        _f_depths = arch_cfg["f_depths"]
+        _block_kw = dict(arch_cfg["block_kw"])
+        _block_kw["norm_layer"] = nn.LayerNorm
+        _one_d_kw = dict(arch_cfg["one_d_kw"])
+        _one_d_kw["norm_layer"] = nn.LayerNorm
+        self.patch_embeds = nn.ModuleList(
+            nn.Conv2d(256, 256, kernel_size=k, stride=s) for k, s in _patch_cfgs
+        )
+        self.f_blocks = nn.ModuleList(
+            nn.ModuleList([Block(**_block_kw) for _ in range(n)]) for n in _f_depths
+        )
+        self.a_blocks = nn.ModuleList(
+            nn.ModuleList([OneDBlock(**_one_d_kw) for _ in range(3)]) for _ in range(3)
+        )
+        self.fusion_modules = nn.ModuleList(
+            AudioVisualFusionModule(in_channels=256, mode='dot') for _ in range(3)
+        )
+        self.smooth_convs = nn.ModuleList(
+            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0) for _ in range(2)
+        )
+        self.train_proj_v1 = ProjectionHead(dim_in=256, proj_dim=128)
+        self.train_proj_a1 = ProjectionHead(dim_in=256, norm_act=nn.BatchNorm1d, conv_layer=nn.Conv1d, proj_dim=128)
+    @staticmethod
+    def positionalencoding1d(d_model, length):
+        if d_model % 2 != 0:
+            raise ValueError("Cannot use sin/cos positional encoding with "
+                             "odd dim (got dim={:d})".format(d_model))
+        pe = torch.zeros(length, d_model)
+        position = torch.arange(0, length).unsqueeze(1)
+        div_term = torch.exp((torch.arange(0, d_model, 2, dtype=torch.float) *
+                              -(math.log(10000.0) / d_model)))
+        pe[:, 0::2] = torch.sin(position.float() * div_term)
+        pe[:, 1::2] = torch.cos(position.float() * div_term)
+        return pe
+    def forward(self, feature_dicts, spect=None):
+        image_embed_shape = [self.hyp_param.image_embedding_size] * 2
+        H, W = image_embed_shape[0], image_embed_shape[1]
+        d = torch.cat(
+            [
+                self.vgg(spect[:, 0, ...].unsqueeze(1)),
+                self.vgg(spect[:, 1, ...].unsqueeze(1)),
+            ],
+            dim=-1,
+        )
+        length = d.shape[-1]
+        fix_audio_pos = self.positionalencoding1d(length, 1).squeeze().to(spect.device)
+        fpn = list(feature_dicts["backbone_fpn"])
+        patch_embeds = list(self.patch_embeds)
+        f_blocks = list(self.f_blocks)
+        a_blocks = list(self.a_blocks)
+        tpavi = list(self.fusion_modules)
+        smooths = [None, self.smooth_convs[0], self.smooth_convs[1]]
+        feats = [None, None, None]
+        d_outputs = []
+        for i in range(3):
+            x = fpn[i]
+            x = patch_embeds[i](x)
+            x_pos = self.position_encoding_func(x)
+            x = x.flatten(2).permute(0, 2, 1)
+            x_pos = x_pos.flatten(2).permute(0, 2, 1)
+            if i == 0:
+                x = x + x_pos
+                d = d + fix_audio_pos
+            else:
+                x = x + feats[i - 1]
+                x = smooths[i](
+                    x.permute(0, 2, 1).reshape(x.shape[0], 256, H, W)
+                ).flatten(2).permute(0, 2, 1)
+                x = x + x_pos
+                d = d + fix_audio_pos
+            for blks in f_blocks[i]:
+                x = blks(x, H, W, x_pos)
+            for blks in a_blocks[i]:
+                d = blks(d, fix_audio_pos)
+            x = x + x_pos
+            d = d + fix_audio_pos
+            x, d_out, _, _ = tpavi[i](x, H, W, x_pos, d, length)
+            d = d_out
+            feats[i] = x
+            d_outputs.append(d_out)
+        a, b, c = feats
+        d1, d2, d3 = d_outputs
+        feature_residual = [a, b, c]
+        audio_out = [d1, d2, d3]
+        proj_feature_out = [
+            [
+                self.train_proj_v1(a.permute(0, 2, 1).reshape(-1, 256, *image_embed_shape)),
+                self.train_proj_v1(b.permute(0, 2, 1).reshape(-1, 256, *image_embed_shape)),
+                self.train_proj_v1(c.permute(0, 2, 1).reshape(-1, 256, *image_embed_shape)),
+            ],
+            [
+                self.train_proj_a1(d1.unsqueeze(-1)),
+                self.train_proj_a1(d2.unsqueeze(-1)),
+                self.train_proj_a1(d3.unsqueeze(-1)),
+            ],
+        ]
+        return feature_residual, audio_out, proj_feature_out
+class AudioVisualFusionModule(nn.Module):
+    def __init__(self, in_channels, inter_channels=None, mode='dot',
+                 dimension=3):
+        super().__init__()
+        assert mode == 'dot'
+        self.mode = mode
+        self.dimension = dimension
+        self.in_channels = in_channels
+        self.inter_channels = in_channels // 2
+        self.align_channel = nn.Conv1d(256, in_channels, kernel_size=1)
+        self.align_channel_back = nn.Conv1d(in_channels, 128, kernel_size=1)
+        self.norm_layer = nn.LayerNorm(in_channels)
+        if dimension == 3:
+            conv_nd = nn.Conv3d
+            bn = nn.BatchNorm3d
+        elif dimension == 2:
+            conv_nd = nn.Conv2d
+            bn = nn.BatchNorm2d
+        else:
+            conv_nd = nn.Conv1d
+            bn = nn.BatchNorm1d
+        self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
+        self.W_z = nn.Sequential(
+            conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1),
+            bn(self.in_channels)
+        )
+        nn.init.constant_(self.W_z[1].weight, 0)
+        nn.init.constant_(self.W_z[1].bias, 0)
+        self.W_z2 = nn.Sequential(
+            nn.Conv1d(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1),
+            nn.BatchNorm1d(self.in_channels)
+        )
+        nn.init.constant_(self.W_z2[1].weight, 0)
+        nn.init.constant_(self.W_z2[1].bias, 0)
+        self.norm_layer2 = nn.LayerNorm(self.in_channels)
+        self.q_frame = nn.Conv3d(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
+        self.k_frame = nn.Conv3d(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
+        self.v_frame = nn.Conv3d(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
+        self.q_audio = nn.Conv1d(self.in_channels, self.inter_channels, kernel_size=1)
+        self.k_audio = nn.Conv1d(self.in_channels, self.inter_channels, kernel_size=1)
+        self.v_audio = nn.Conv1d(self.in_channels, self.inter_channels, kernel_size=1)
+    def forward(self, frame, H_x, W_x, tmp1, audio, tmp2):
+        frame = frame.permute(0, 2, 1)
+        frame = frame.reshape(frame.shape[0], frame.shape[1], H_x, W_x)
+        frame = frame.unsqueeze(2)
+        audio = self.align_channel(audio.unsqueeze(-1))
+        batch_size, _ = frame.size(0), frame.size(1)
+        q_frame = self.q_frame(frame).reshape(1, -1, self.inter_channels)
+        k_frame = self.k_frame(frame).reshape(1, -1, self.inter_channels)
+        v_frame = self.v_frame(frame).reshape(1, -1, self.inter_channels)
+        q_audio = self.q_audio(audio).reshape(1, -1, self.inter_channels)
+        k_audio = self.k_audio(audio).reshape(1, -1, self.inter_channels)
+        v_audio = self.v_audio(audio).reshape(1, -1, self.inter_channels)
+        f = torch.matmul(q_frame, k_audio.mT)
+        f_normalise = f / f.size(1)
+        frame_attn = torch.matmul(f_normalise, v_audio)
+        frame_attn = frame_attn.permute(0, 2, 1).contiguous()
+        frame_attn = frame_attn.view(batch_size, self.inter_channels, *frame.size()[2:])
+        frame_attn = self.W_z(frame_attn)
+        frame = frame_attn + frame
+        frame = frame.permute(0, 2, 3, 4, 1)
+        frame = self.norm_layer(frame)
+        frame = frame.permute(0, 4, 1, 2, 3)
+        frame = frame.squeeze().flatten(start_dim=2).permute(0, 2, 1)
+        a = torch.matmul(q_audio, k_frame.mT)
+        a_normalise = a / a.size(-1)
+        audio_attn = torch.matmul(a_normalise, v_frame)
+        audio_attn = audio_attn.permute(0, 2, 1).contiguous()
+        audio_attn = audio_attn.view(batch_size, self.inter_channels).unsqueeze(-1)
+        audio_attn = self.W_z2(audio_attn)
+        audio = audio_attn + audio
+        audio = self.norm_layer2(audio.squeeze()).squeeze()
+        return frame, audio, frame_attn, audio_attn
+class OneDBlock(nn.Module):
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = OneDAttention(
+            dim,
+            num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+            attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = OneDMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop,
+                           linear=linear)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+    def forward(self, x, _pos):
+        x = x + self.drop_path(self.attn(self.norm1(x)))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+class OneDAttention(nn.Module):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1,
+                 linear=False):
+        super().__init__()
+        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        self.q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.linear = linear
+        self.sr_ratio = sr_ratio
+        if not linear:
+            if sr_ratio > 1:
+                self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
+                self.norm = nn.LayerNorm(dim)
+        else:
+            self.pool = nn.AdaptiveAvgPool2d(7)
+            self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1)
+            self.norm = nn.LayerNorm(dim)
+            self.act = nn.GELU()
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+    def forward(self, x):
+        x = x.unsqueeze(0)
+        B, N, C = x.shape
+        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+        kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        k, v = kv[0], kv[1]
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        x = x.squeeze()
+        return x
+class OneDMlp(nn.Module):
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.dwconv = DWConv(hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+        self.linear = linear
+        if self.linear:
+            self.relu = nn.ReLU(inplace=True)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+    def forward(self, x):
+        x = self.fc1(x)
+        if self.linear:
+            x = self.relu(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class Block(nn.Module):
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, linear=False):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+            attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio, linear=linear)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop, linear=linear)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+    def forward(self, x, H, W, _pos):
+        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
+        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
+        return x
+class Attention(nn.Module):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1,
+                 linear=False):
+        super().__init__()
+        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        self.q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.linear = linear
+        self.sr_ratio = sr_ratio
+        if not linear:
+            if sr_ratio > 1:
+                self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
+                self.norm = nn.LayerNorm(dim)
+        else:
+            self.pool = nn.AdaptiveAvgPool2d(7)
+            self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1)
+            self.norm = nn.LayerNorm(dim)
+            self.act = nn.GELU()
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+    def forward(self, x, H, W):
+        B, N, C = x.shape
+        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+        if not self.linear:
+            if self.sr_ratio > 1:
+                x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
+                x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
+                x_ = self.norm(x_)
+                kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+            else:
+                kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        else:
+            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
+            x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1)
+            x_ = self.norm(x_)
+            x_ = self.act(x_)
+            kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        k, v = kv[0], kv[1]
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class Mlp(nn.Module):
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0., linear=False):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.dwconv = DWConv(hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+        self.linear = linear
+        if self.linear:
+            self.relu = nn.ReLU(inplace=True)
+        self.apply(self._init_weights)
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+        elif isinstance(m, nn.Conv2d):
+            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+            fan_out //= m.groups
+            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if m.bias is not None:
+                m.bias.data.zero_()
+    def forward(self, x, H, W):
+        x = self.fc1(x)
+        if self.linear:
+            x = self.relu(x)
+        x = self.dwconv(x, H, W)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class DWConv(nn.Module):
+    def __init__(self, dim=768):
+        super(DWConv, self).__init__()
+        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
+    def forward(self, x, H, W):
+        B, N, C = x.shape
+        x = x.transpose(1, 2).view(B, C, H, W)
+        x = self.dwconv(x)
+        x = x.flatten(2).transpose(1, 2)
+        return x

avs.code/v1m.code/model/mymodel.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import logging
+from typing import List, Optional, Tuple, Union
+import numpy
+import numpy as np
+import torch
+from PIL.Image import Image
+from model.visual.sam2.modeling.sam2_base import SAM2Base
+from model.visual.sam2.modeling.backbones.hieradet import Hiera
+from model.visual.sam2.modeling.backbones.image_encoder import FpnNeck
+from model.visual.sam2.modeling.backbones.image_encoder import ImageEncoder
+from model.visual.sam2.modeling.position_encoding import PositionEmbeddingSine
+from model.visual.sam2.modeling.memory_attention import MemoryAttention
+from model.visual.sam2.modeling.memory_attention import MemoryAttentionLayer
+from model.visual.sam2.modeling.sam.transformer import RoPEAttention
+from model.visual.sam2.modeling.memory_encoder import MemoryEncoder
+from model.visual.sam2.modeling.memory_encoder import MaskDownSampler
+from model.visual.sam2.modeling.memory_encoder import Fuser
+from model.visual.sam2.modeling.memory_encoder import CXBlock
+from model.visual.sam2.utils.transforms import SAM2Transforms
+from model.visual.sam2.modeling.backbones.hieradet import do_pool
+from model.visual.sam2.modeling.backbones.utils import (
+    PatchEmbed,
+    window_partition,
+    window_unpartition,
+)
+class AVmodel(torch.nn.Module):
+    """End-to-end AV segmentation: SAM2 visual backbone + AuralFuser audio-visual fusion + tracking head."""
+    def __init__(self, param, mask_threshold=0.0, max_hole_area=0.0, max_sprinkle_area=0.0, ):
+        super().__init__()
+        self.param = param
+        self.mask_threshold = mask_threshold
+        self._bb_feat_sizes = [(int(self.param.image_size / 4), int(self.param.image_size / 4)),
+                               (int(self.param.image_size / 8), int(self.param.image_size / 8)),
+                               (int(self.param.image_size / 16), int(self.param.image_size / 16))]
+        from model.visual.sam2.build_sam import build_sam2_visual_predictor
+        self.v_model = build_sam2_visual_predictor(self.param.sam_config_path, self.param.backbone_weight,
+                                                   apply_postprocessing=True, mode='train')
+        self._transforms = SAM2Transforms(
+            resolution=self.v_model.image_size,
+            mask_threshold=mask_threshold,
+            max_hole_area=max_hole_area,
+            max_sprinkle_area=max_sprinkle_area,
+        )
+        from model.aural_fuser import AuralFuser
+        self.aural_fuser = AuralFuser(hyp_param=self.param)
+    def _prepare_backbone_features(self, backbone_out):
+        """Prepare and flatten visual features."""
+        backbone_out = backbone_out.copy()
+        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
+        assert len(backbone_out["backbone_fpn"]) >= self.v_model.num_feature_levels
+        feature_maps = backbone_out["backbone_fpn"][-self.v_model.num_feature_levels:]
+        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.v_model.num_feature_levels:]
+        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
+        vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
+        vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
+        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
+    def forward_frame(self, frame_):
+        frame = torch.nn.functional.interpolate(frame_, (self.param.image_size, self.param.image_size),
+                                                antialias=True, align_corners=False, mode='bilinear')
+        return self.v_model.image_encoder(frame)
+    def forward(self, frames, spect, prompts, sam_process=False):
+        """Fuse audio into FPN features, then run SAM2 tracking. `sam_process` is reserved for prompt path."""
+        backbone_feats = self.v_model.forward_image(frames, pre_compute=False)
+        audio_residual_feats = self.aural_fuser(backbone_feats, spect)
+        visual_resfeats, audio_resfeats, proj_feats = audio_residual_feats
+        map_res = visual_resfeats[::-1]
+        vec_res = audio_resfeats[::-1]
+        av_feats = (map_res, vec_res)
+        backbone_feats = self.v_model.precompute_high_res_features(backbone_feats)
+        backbone_feats = self.v_model.dont_prepare_prompt_inputs(backbone_feats, num_frames=frames.shape[0],
+                cond_frame=int(frames.shape[0]/2) if self.training else 0)
+        outputs = self.v_model.forward_tracking_wo_prompt(backbone_feats, audio_res=av_feats)
+        return outputs, proj_feats
+    @property
+    def device(self) -> torch.device:
+        return self.v_model.device
+    def freeze_sam_parameters(self):
+        self.v_model.eval()
+        for name, parameter in self.v_model.named_parameters():
+            parameter.requires_grad = False

avs.code/v1m.code/model/visual/sam2/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from hydra import initialize_config_module
+from hydra.core.global_hydra import GlobalHydra
+# Point Hydra at the `configs` config module at import time, unless GlobalHydra
+# reports that it has already been initialized.
+if not GlobalHydra.instance().is_initialized():
+    initialize_config_module("configs", version_base="1.2")

avs.code/v1m.code/model/visual/sam2/build_sam.py ADDED Viewed

	@@ -0,0 +1,171 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import os
+import torch
+from hydra import compose
+from hydra.utils import instantiate
+from omegaconf import OmegaConf
+'''
+import sam2
+# Check if the user is running Python from the parent directory of the sam2 repo
+# (i.e. the directory where this repo is cloned into) -- this is not supported since
+# it could shadow the sam2 package and cause issues.
+if os.path.isdir(os.path.join(sam2.__path__[0], "sam2")):
+    # If the user has "sam2/sam2" in their path, they are likey importing the repo itself
+    # as "sam2" rather than importing the "sam2" python package (i.e. "sam2/sam2" directory).
+    # This typically happens because the user is running Python from the parent directory
+    # that contains the sam2 repo they cloned.
+    raise RuntimeError(
+        "You're likely running Python from the parent directory of the sam2 repository "
+        "(i.e. the directory where https://github.com/facebookresearch/sam2 is cloned into). "
+        "This is not supported since the `sam2` Python package could be shadowed by the "
+        "repository name (the repository is also named `sam2` and contains the Python package "
+        "in `sam2/sam2`). Please run Python from another directory (e.g. from the repo dir "
+        "rather than its parent dir, or from your home directory) after installing SAM 2."
+    )
+'''
+# Maps a Hugging Face model id to a (config name, checkpoint filename) pair.
+# `_hf_download` unpacks the pair and downloads the checkpoint file from the repo named by the key.
+# NOTE(review): the config names are presumably resolved by Hydra relative to the `configs`
+# module; confirm that the `sam2.1/...` configs are actually shipped with this repository.
+HF_MODEL_ID_TO_FILENAMES = {
+    "facebook/sam2-hiera-tiny": (
+        "sam2/sam2_hiera_t.yaml",
+        "sam2_hiera_tiny.pt",
+    ),
+    "facebook/sam2-hiera-small": (
+        "sam2/sam2_hiera_s.yaml",
+        "sam2_hiera_small.pt",
+    ),
+    "facebook/sam2-hiera-base-plus": (
+        "sam2/sam2_hiera_b+.yaml",
+        "sam2_hiera_base_plus.pt",
+    ),
+    "facebook/sam2-hiera-large": (
+        "sam2/sam2_hiera_l.yaml",
+        "sam2_hiera_large.pt",
+    ),
+    "facebook/sam2.1-hiera-tiny": (
+        "sam2.1/sam2.1_hiera_t.yaml",
+        "sam2.1_hiera_tiny.pt",
+    ),
+    "facebook/sam2.1-hiera-small": (
+        "sam2.1/sam2.1_hiera_s.yaml",
+        "sam2.1_hiera_small.pt",
+    ),
+    "facebook/sam2.1-hiera-base-plus": (
+        "sam2.1/sam2.1_hiera_b+.yaml",
+        "sam2.1_hiera_base_plus.pt",
+    ),
+    "facebook/sam2.1-hiera-large": (
+        "sam2.1/sam2.1_hiera_l.yaml",
+        "sam2.1_hiera_large.pt",
+    ),
+}
+def build_sam2(
+    config_file,
+    ckpt_path=None,
+    device="cuda",
+    mode="eval",
+    hydra_overrides_extra=[],
+    apply_postprocessing=True,
+    **kwargs,
+):
+    if apply_postprocessing:
+        hydra_overrides_extra = hydra_overrides_extra.copy()
+        hydra_overrides_extra += [
+            # dynamically fall back to multi-mask if the single mask is not stable
+            "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
+            "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
+            "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
+        ]
+    # Read config and init model
+    cfg = compose(config_name=config_file, overrides=hydra_overrides_extra)
+    OmegaConf.resolve(cfg)
+    model = instantiate(cfg.model, _recursive_=True)
+    _load_checkpoint(model, ckpt_path)
+    model = model.to(device)
+    if mode == "eval":
+        model.eval()
+    return model
+def build_sam2_visual_predictor(
+    config_file,
+    ckpt_path=None,
+    mode="eval",
+    hydra_overrides_extra=[],
+    apply_postprocessing=True,
+    **kwargs,
+):
+    # visual
+    hydra_overrides = []
+        # "++model._target_=model.visual.sam2.organised_sam2_train.SAM2Train",
+    # ]
+    # hydra_overrides = [
+    #     "++model._target_=sam2.sam2_video_predictor.SAM2VideoPredictor",
+    # ]
+    if apply_postprocessing:
+        hydra_overrides_extra = hydra_overrides_extra.copy()
+        hydra_overrides_extra += [
+            # dynamically fall back to multi-mask if the single mask is not stable
+            # "++model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability=true",
+            # "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta=0.05",
+            # "++model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh=0.98",
+            # the sigmoid mask logits on interacted frames with clicks in the memory encoder so that the encoded masks are exactly as what users see from clicking
+            "++model.binarize_mask_from_pts_for_mem_enc=true",
+            # fill small holes in the low-res masks up to `fill_hole_area` (before resizing them to the original video resolution)
+            # "++model.fill_hole_area=8",
+        ]
+    hydra_overrides.extend(hydra_overrides_extra)
+    # Read config and init model
+    cfg = compose(config_name=config_file, overrides=hydra_overrides)
+    OmegaConf.resolve(cfg)
+    model = instantiate(cfg.model, _recursive_=True)
+    _load_checkpoint(model, ckpt_path)
+    if mode == "eval":
+        model.eval()
+    return model
+def _hf_download(model_id):
+    from huggingface_hub import hf_hub_download
+    config_name, checkpoint_name = HF_MODEL_ID_TO_FILENAMES[model_id]
+    ckpt_path = hf_hub_download(repo_id=model_id, filename=checkpoint_name)
+    return config_name, ckpt_path
+def build_sam2_hf(model_id, **kwargs):
+    config_name, ckpt_path = _hf_download(model_id)
+    return build_sam2(config_file=config_name, ckpt_path=ckpt_path, **kwargs)
+# def build_sam2_video_predictor_hf(model_id, **kwargs):
+#     config_name, ckpt_path = _hf_download(model_id)
+#     return build_sam2_video_predictor(
+#         config_file=config_name, ckpt_path=ckpt_path, **kwargs
+#     )
+def _load_checkpoint(model, ckpt_path):
+    if ckpt_path is not None:
+        sd = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]
+        missing_keys, unexpected_keys = model.load_state_dict(sd)
+        if missing_keys:
+            logging.error(missing_keys)
+            raise RuntimeError()
+        if unexpected_keys:
+            logging.error(unexpected_keys)
+            raise RuntimeError()
+        logging.info("Loaded checkpoint sucessfully")

avs.code/v1m.code/model/visual/sam2/modeling/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.

avs.code/v1m.code/model/visual/sam2/modeling/backbones/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.

avs.code/v1m.code/model/visual/sam2/modeling/backbones/hieradet.py ADDED Viewed

	@@ -0,0 +1,317 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+from functools import partial
+from typing import List, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from iopath.common.file_io import g_pathmgr
+from model.visual.sam2.modeling.backbones.utils import (
+    PatchEmbed,
+    window_partition,
+    window_unpartition,
+)
+from model.visual.sam2.modeling.sam2_utils import DropPath, MLP
+def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor:
+    if pool is None:
+        return x
+    # (B, H, W, C) -> (B, C, H, W)
+    x = x.permute(0, 3, 1, 2)
+    x = pool(x)
+    # (B, C, H', W') -> (B, H', W', C)
+    x = x.permute(0, 2, 3, 1)
+    if norm:
+        x = norm(x)
+    return x
+class MultiScaleAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        dim_out: int,
+        num_heads: int,
+        q_pool: nn.Module = None,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.dim_out = dim_out
+        self.num_heads = num_heads
+        self.q_pool = q_pool
+        self.qkv = nn.Linear(dim, dim_out * 3)
+        self.proj = nn.Linear(dim_out, dim_out)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, H, W, _ = x.shape
+        # qkv with shape (B, H * W, 3, nHead, C)
+        qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1)
+        # q, k, v with shape (B, H * W, nheads, C)
+        q, k, v = torch.unbind(qkv, 2)
+        # Q pooling (for downsample at stage changes)
+        if self.q_pool:
+            q = do_pool(q.reshape(B, H, W, -1), self.q_pool)
+            H, W = q.shape[1:3]  # downsampled shape
+            q = q.reshape(B, H * W, self.num_heads, -1)
+        # Torch's SDPA expects [B, nheads, H*W, C] so we transpose
+        x = F.scaled_dot_product_attention(
+            q.transpose(1, 2),
+            k.transpose(1, 2),
+            v.transpose(1, 2),
+        )
+        # Transpose back
+        x = x.transpose(1, 2)
+        x = x.reshape(B, H, W, -1)
+        x = self.proj(x)
+        return x
+class MultiScaleBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        dim_out: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        drop_path: float = 0.0,
+        norm_layer: Union[nn.Module, str] = "LayerNorm",
+        q_stride: Tuple[int, int] = None,
+        act_layer: nn.Module = nn.GELU,
+        window_size: int = 0,
+    ):
+        super().__init__()
+        if isinstance(norm_layer, str):
+            norm_layer = partial(getattr(nn, norm_layer), eps=1e-6)
+        self.dim = dim
+        self.dim_out = dim_out
+        self.norm1 = norm_layer(dim)
+        self.window_size = window_size
+        self.pool, self.q_stride = None, q_stride
+        if self.q_stride:
+            self.pool = nn.MaxPool2d(
+                kernel_size=q_stride, stride=q_stride, ceil_mode=False
+            )
+        self.attn = MultiScaleAttention(
+            dim,
+            dim_out,
+            num_heads=num_heads,
+            q_pool=self.pool,
+        )
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim_out)
+        self.mlp = MLP(
+            dim_out,
+            int(dim_out * mlp_ratio),
+            dim_out,
+            num_layers=2,
+            activation=act_layer,
+        )
+        if dim != dim_out:
+            self.proj = nn.Linear(dim, dim_out)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shortcut = x  # B, H, W, C
+        x = self.norm1(x)
+        # Skip connection
+        if self.dim != self.dim_out:
+            shortcut = do_pool(self.proj(x), self.pool)
+        # Window partition
+        window_size = self.window_size
+        if window_size > 0:
+            H, W = x.shape[1], x.shape[2]
+            x, pad_hw = window_partition(x, window_size)
+        # Window Attention + Q Pooling (if stage change)
+        x = self.attn(x)
+        if self.q_stride:
+            # Shapes have changed due to Q pooling
+            window_size = self.window_size // self.q_stride[0]
+            H, W = shortcut.shape[1:3]
+            pad_h = (window_size - H % window_size) % window_size
+            pad_w = (window_size - W % window_size) % window_size
+            pad_hw = (H + pad_h, W + pad_w)
+        # Reverse window partition
+        if self.window_size > 0:
+            x = window_unpartition(x, window_size, pad_hw, (H, W))
+        x = shortcut + self.drop_path(x)
+        # MLP
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+class Hiera(nn.Module):
+    """
+    Reference: https://arxiv.org/abs/2306.00989
+    """
+    def __init__(
+        self,
+        embed_dim: int = 96,  # initial embed dim
+        num_heads: int = 1,  # initial number of heads
+        drop_path_rate: float = 0.0,  # stochastic depth
+        q_pool: int = 3,  # number of q_pool stages
+        q_stride: Tuple[int, int] = (2, 2),  # downsample stride bet. stages
+        stages: Tuple[int, ...] = (2, 3, 16, 3),  # blocks per stage
+        dim_mul: float = 2.0,  # dim_mul factor at stage shift
+        head_mul: float = 2.0,  # head_mul factor at stage shift
+        window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
+        # window size per stage, when not using global att.
+        window_spec: Tuple[int, ...] = (
+            8,
+            4,
+            14,
+            7,
+        ),
+        # global attn in these blocks
+        global_att_blocks: Tuple[int, ...] = (
+            12,
+            16,
+            20,
+        ),
+        weights_path=None,
+        return_interm_layers=True,  # return feats from every stage
+    ):
+        super().__init__()
+        assert len(stages) == len(window_spec)
+        self.window_spec = window_spec
+        depth = sum(stages)
+        self.q_stride = q_stride
+        self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)]
+        assert 0 <= q_pool <= len(self.stage_ends[:-1])
+        self.q_pool_blocks = [x + 1 for x in self.stage_ends[:-1]][:q_pool]
+        self.return_interm_layers = return_interm_layers
+        self.patch_embed = PatchEmbed(
+            embed_dim=embed_dim,
+        )
+        # Which blocks have global att?
+        self.global_att_blocks = global_att_blocks
+        # Windowed positional embedding (https://arxiv.org/abs/2311.05613)
+        self.window_pos_embed_bkg_spatial_size = window_pos_embed_bkg_spatial_size
+        self.pos_embed = nn.Parameter(
+            torch.zeros(1, embed_dim, *self.window_pos_embed_bkg_spatial_size)
+        )
+        self.pos_embed_window = nn.Parameter(
+            torch.zeros(1, embed_dim, self.window_spec[0], self.window_spec[0])
+        )
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, depth)
+        ]  # stochastic depth decay rule
+        cur_stage = 1
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            dim_out = embed_dim
+            # lags by a block, so first block of
+            # next stage uses an initial window size
+            # of previous stage and final window size of current stage
+            window_size = self.window_spec[cur_stage - 1]
+            if self.global_att_blocks is not None:
+                window_size = 0 if i in self.global_att_blocks else window_size
+            if i - 1 in self.stage_ends:
+                dim_out = int(embed_dim * dim_mul)
+                num_heads = int(num_heads * head_mul)
+                cur_stage += 1
+            block = MultiScaleBlock(
+                dim=embed_dim,
+                dim_out=dim_out,
+                num_heads=num_heads,
+                drop_path=dpr[i],
+                q_stride=self.q_stride if i in self.q_pool_blocks else None,
+                window_size=window_size,
+            )
+            embed_dim = dim_out
+            self.blocks.append(block)
+        self.channel_list = (
+            [self.blocks[i].dim_out for i in self.stage_ends[::-1]]
+            if return_interm_layers
+            else [self.blocks[-1].dim_out]
+        )
+        if weights_path is not None:
+            with g_pathmgr.open(weights_path, "rb") as f:
+                chkpt = torch.load(f, map_location="cpu")
+            logging.info("loading Hiera", self.load_state_dict(chkpt, strict=False))
+    def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
+        h, w = hw
+        window_embed = self.pos_embed_window
+        pos_embed = F.interpolate(self.pos_embed, size=(h, w), mode="bicubic")
+        pos_embed = pos_embed + window_embed.tile(
+            [x // y for x, y in zip(pos_embed.shape, window_embed.shape)]
+        )
+        pos_embed = pos_embed.permute(0, 2, 3, 1)
+        return pos_embed
+    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+        x = self.patch_embed(x)
+        # x: (B, H, W, C)
+        # Add pos embed
+        x = x + self._get_pos_embed(x.shape[1:3])
+        outputs = []
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if (i == self.stage_ends[-1]) or (
+                i in self.stage_ends and self.return_interm_layers
+            ):
+                feats = x.permute(0, 3, 1, 2)
+                outputs.append(feats)
+        return outputs
+    def get_layer_id(self, layer_name):
+        # https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33
+        num_layers = self.get_num_layers()
+        if layer_name.find("rel_pos") != -1:
+            return num_layers + 1
+        elif layer_name.find("pos_embed") != -1:
+            return 0
+        elif layer_name.find("patch_embed") != -1:
+            return 0
+        elif layer_name.find("blocks") != -1:
+            return int(layer_name.split("blocks")[1].split(".")[1]) + 1
+        else:
+            return num_layers + 1
+    def get_num_layers(self) -> int:
+        return len(self.blocks)

avs.code/v1m.code/model/visual/sam2/modeling/backbones/image_encoder.py ADDED Viewed

	@@ -0,0 +1,134 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class ImageEncoder(nn.Module):
+    def __init__(
+        self,
+        trunk: nn.Module,
+        neck: nn.Module,
+        scalp: int = 0,
+    ):
+        super().__init__()
+        self.trunk = trunk
+        self.neck = neck
+        self.scalp = scalp
+        assert (
+            self.trunk.channel_list == self.neck.backbone_channel_list
+        ), f"Channel dims of trunk and neck do not match. Trunk: {self.trunk.channel_list}, neck: {self.neck.backbone_channel_list}"
+    def forward(self, sample: torch.Tensor):
+        # Forward through backbone
+        features, pos = self.neck(self.trunk(sample))
+        if self.scalp > 0:
+            # Discard the lowest resolution features
+            features, pos = features[: -self.scalp], pos[: -self.scalp]
+        src = features[-1]
+        output = {
+            "vision_features": src,
+            "vision_pos_enc": pos,
+            "backbone_fpn": features,
+        }
+        return output
+class FpnNeck(nn.Module):
+    """
+    A modified variant of Feature Pyramid Network (FPN) neck
+    (we remove output conv and also do bicubic interpolation similar to ViT
+    pos embed interpolation)
+    """
+    def __init__(
+        self,
+        position_encoding: nn.Module,
+        d_model: int,
+        backbone_channel_list: List[int],
+        kernel_size: int = 1,
+        stride: int = 1,
+        padding: int = 0,
+        fpn_interp_model: str = "bilinear",
+        fuse_type: str = "sum",
+        fpn_top_down_levels: Optional[List[int]] = None,
+    ):
+        """Initialize the neck
+        :param trunk: the backbone
+        :param position_encoding: the positional encoding to use
+        :param d_model: the dimension of the model
+        :param neck_norm: the normalization to use
+        """
+        super().__init__()
+        self.position_encoding = position_encoding
+        self.convs = nn.ModuleList()
+        self.backbone_channel_list = backbone_channel_list
+        self.d_model = d_model
+        for dim in backbone_channel_list:
+            current = nn.Sequential()
+            current.add_module(
+                "conv",
+                nn.Conv2d(
+                    in_channels=dim,
+                    out_channels=d_model,
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    padding=padding,
+                ),
+            )
+            self.convs.append(current)
+        self.fpn_interp_model = fpn_interp_model
+        assert fuse_type in ["sum", "avg"]
+        self.fuse_type = fuse_type
+        # levels to have top-down features in its outputs
+        # e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3
+        # have top-down propagation, while outputs of level 0 and level 1 have only
+        # lateral features from the same backbone level.
+        if fpn_top_down_levels is None:
+            # default is to have top-down features on all levels
+            fpn_top_down_levels = range(len(self.convs))
+        self.fpn_top_down_levels = list(fpn_top_down_levels)
+    def forward(self, xs: List[torch.Tensor]):
+        out = [None] * len(self.convs)
+        pos = [None] * len(self.convs)
+        assert len(xs) == len(self.convs)
+        # fpn forward pass
+        # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py
+        prev_features = None
+        # forward in top-down order (from low to high resolution)
+        n = len(self.convs) - 1
+        for i in range(n, -1, -1):
+            x = xs[i]
+            lateral_features = self.convs[n - i](x)
+            if i in self.fpn_top_down_levels and prev_features is not None:
+                top_down_features = F.interpolate(
+                    prev_features.to(dtype=torch.float32),
+                    scale_factor=2.0,
+                    mode=self.fpn_interp_model,
+                    align_corners=(
+                        None if self.fpn_interp_model == "nearest" else False
+                    ),
+                    antialias=False,
+                )
+                prev_features = lateral_features + top_down_features
+                if self.fuse_type == "avg":
+                    prev_features /= 2
+            else:
+                prev_features = lateral_features
+            x_out = prev_features
+            out[i] = x_out
+            pos[i] = self.position_encoding(x_out).to(x_out.dtype)
+        return out, pos

avs.code/v1m.code/model/visual/sam2/modeling/backbones/utils.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Some utilities for backbones, in particular for windowing"""
+from typing import Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+def window_partition(x, window_size):
+    """
+    Partition into non-overlapping windows with padding if needed.
+    Args:
+        x (tensor): input tokens with [B, H, W, C].
+        window_size (int): window size.
+    Returns:
+        windows: windows after partition with [B * num_windows, window_size, window_size, C].
+        (Hp, Wp): padded height and width before partition
+    """
+    B, H, W, C = x.shape
+    pad_h = (window_size - H % window_size) % window_size
+    pad_w = (window_size - W % window_size) % window_size
+    if pad_h > 0 or pad_w > 0:
+        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+    Hp, Wp = H + pad_h, W + pad_w
+    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
+    windows = (
+        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    )
+    return windows, (Hp, Wp)
+def window_unpartition(windows, window_size, pad_hw, hw):
+    """
+    Window unpartition into original sequences and removing padding.
+    Args:
+        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+        window_size (int): window size.
+        pad_hw (Tuple): padded height and width (Hp, Wp).
+        hw (Tuple): original height and width (H, W) before padding.
+    Returns:
+        x: unpartitioned sequences with [B, H, W, C].
+    """
+    Hp, Wp = pad_hw
+    H, W = hw
+    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+    x = windows.view(
+        B, Hp // window_size, Wp // window_size, window_size, window_size, -1
+    )
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
+    if Hp > H or Wp > W:
+        x = x[:, :H, :W, :].contiguous()
+    return x
+class PatchEmbed(nn.Module):
+    """
+    Image to Patch Embedding.
+    """
+    def __init__(
+        self,
+        kernel_size: Tuple[int, ...] = (7, 7),
+        stride: Tuple[int, ...] = (4, 4),
+        padding: Tuple[int, ...] = (3, 3),
+        in_chans: int = 3,
+        embed_dim: int = 768,
+    ):
+        """
+        Args:
+            kernel_size (Tuple): kernel size of the projection layer.
+            stride (Tuple): stride of the projection layer.
+            padding (Tuple): padding size of the projection layer.
+            in_chans (int): Number of input image channels.
+            embed_dim (int):  embed_dim (int): Patch embedding dimension.
+        """
+        super().__init__()
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        # B C H W -> B H W C
+        x = x.permute(0, 2, 3, 1)
+        return x

avs.code/v1m.code/model/visual/sam2/modeling/memory_attention.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Optional
+import torch
+from torch import nn, Tensor
+from model.visual.sam2.modeling.sam.transformer import RoPEAttention
+from model.visual.sam2.modeling.sam2_utils import get_activation_fn, get_clones
+class MemoryAttentionLayer(nn.Module):
+    def __init__(
+        self,
+        activation: str,
+        cross_attention: nn.Module,
+        d_model: int,
+        dim_feedforward: int,
+        dropout: float,
+        pos_enc_at_attn: bool,
+        pos_enc_at_cross_attn_keys: bool,
+        pos_enc_at_cross_attn_queries: bool,
+        self_attention: nn.Module,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.dim_feedforward = dim_feedforward
+        self.dropout_value = dropout
+        self.self_attn = self_attention
+        self.cross_attn_image = cross_attention
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.dropout3 = nn.Dropout(dropout)
+        self.activation_str = activation
+        self.activation = get_activation_fn(activation)
+        # Where to add pos enc
+        self.pos_enc_at_attn = pos_enc_at_attn
+        self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
+        self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys
+    def _forward_sa(self, tgt, query_pos):
+        # Self-Attention
+        tgt2 = self.norm1(tgt)
+        q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
+        tgt2 = self.self_attn(q, k, v=tgt2)
+        tgt = tgt + self.dropout1(tgt2)
+        return tgt
+    def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0):
+        kwds = {}
+        if num_k_exclude_rope > 0:
+            assert isinstance(self.cross_attn_image, RoPEAttention)
+            kwds = {"num_k_exclude_rope": num_k_exclude_rope}
+        # Cross-Attention
+        tgt2 = self.norm2(tgt)
+        tgt2 = self.cross_attn_image(
+            q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
+            k=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
+            v=memory,
+            **kwds,
+        )
+        tgt = tgt + self.dropout2(tgt2)
+        return tgt
+    def forward(
+        self,
+        tgt,
+        memory,
+        pos: Optional[Tensor] = None,
+        query_pos: Optional[Tensor] = None,
+        num_k_exclude_rope: int = 0,
+    ) -> torch.Tensor:
+        # Self-Attn, Cross-Attn
+        tgt = self._forward_sa(tgt, query_pos)
+        tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope)
+        # MLP
+        tgt2 = self.norm3(tgt)
+        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
+        tgt = tgt + self.dropout3(tgt2)
+        return tgt
+class MemoryAttention(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        pos_enc_at_input: bool,
+        layer: nn.Module,
+        num_layers: int,
+        batch_first: bool = True,  # Do layers expect batch first input?
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.layers = get_clones(layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = nn.LayerNorm(d_model)
+        self.pos_enc_at_input = pos_enc_at_input
+        self.batch_first = batch_first
+    def forward(
+        self,
+        curr: torch.Tensor,  # self-attention inputs
+        memory: torch.Tensor,  # cross-attention inputs
+        curr_pos: Optional[Tensor] = None,  # pos_enc for self-attention inputs
+        memory_pos: Optional[Tensor] = None,  # pos_enc for cross-attention inputs
+        num_obj_ptr_tokens: int = 0,  # number of object pointer *tokens*
+    ):
+        if isinstance(curr, list):
+            assert isinstance(curr_pos, list)
+            assert len(curr) == len(curr_pos) == 1
+            curr, curr_pos = (
+                curr[0],
+                curr_pos[0],
+            )
+        assert (
+            curr.shape[1] == memory.shape[1]
+        ), "Batch size must be the same for curr and memory"
+        output = curr
+        if self.pos_enc_at_input and curr_pos is not None:
+            output = output + 0.1 * curr_pos
+        if self.batch_first:
+            # Convert to batch first
+            output = output.transpose(0, 1)
+            curr_pos = curr_pos.transpose(0, 1)
+            memory = memory.transpose(0, 1)
+            memory_pos = memory_pos.transpose(0, 1)
+        for layer in self.layers:
+            kwds = {}
+            if isinstance(layer.cross_attn_image, RoPEAttention):
+                kwds = {"num_k_exclude_rope": num_obj_ptr_tokens}
+            output = layer(
+                tgt=output,
+                memory=memory,
+                pos=memory_pos,
+                query_pos=curr_pos,
+                **kwds,
+            )
+        normed_output = self.norm(output)
+        if self.batch_first:
+            # Convert back to seq first
+            normed_output = normed_output.transpose(0, 1)
+            curr_pos = curr_pos.transpose(0, 1)
+        return normed_output

avs.code/v1m.code/model/visual/sam2/modeling/memory_encoder.py ADDED Viewed

	@@ -0,0 +1,181 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from typing import Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from model.visual.sam2.modeling.sam2_utils import DropPath, get_clones, LayerNorm2d
+class MaskDownSampler(nn.Module):
+    """
+    Progressively downsample a mask by total_stride, each time by stride.
+    Note that LayerNorm is applied per *token*, like in ViT.
+    With each downsample (by a factor stride**2), channel capacity increases by the same factor.
+    In the end, we linearly project to embed_dim channels.
+    """
+    def __init__(
+        self,
+        embed_dim=256,
+        kernel_size=4,
+        stride=4,
+        padding=0,
+        total_stride=16,
+        activation=nn.GELU,
+    ):
+        super().__init__()
+        num_layers = int(math.log2(total_stride) // math.log2(stride))
+        assert stride**num_layers == total_stride
+        self.encoder = nn.Sequential()
+        mask_in_chans, mask_out_chans = 1, 1
+        for _ in range(num_layers):
+            mask_out_chans = mask_in_chans * (stride**2)
+            self.encoder.append(
+                nn.Conv2d(
+                    mask_in_chans,
+                    mask_out_chans,
+                    kernel_size=kernel_size,
+                    stride=stride,
+                    padding=padding,
+                )
+            )
+            self.encoder.append(LayerNorm2d(mask_out_chans))
+            self.encoder.append(activation())
+            mask_in_chans = mask_out_chans
+        self.encoder.append(nn.Conv2d(mask_out_chans, embed_dim, kernel_size=1))
+    def forward(self, x):
+        return self.encoder(x)
+# Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt)
+class CXBlock(nn.Module):
+    r"""ConvNeXt Block. There are two equivalent implementations:
+    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
+    (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
+    We use (2) as we find it slightly faster in PyTorch
+    Args:
+        dim (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+    """
+    def __init__(
+        self,
+        dim,
+        kernel_size=7,
+        padding=3,
+        drop_path=0.0,
+        layer_scale_init_value=1e-6,
+        use_dwconv=True,
+    ):
+        super().__init__()
+        self.dwconv = nn.Conv2d(
+            dim,
+            dim,
+            kernel_size=kernel_size,
+            padding=padding,
+            groups=dim if use_dwconv else 1,
+        )  # depthwise conv
+        self.norm = LayerNorm2d(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(
+            dim, 4 * dim
+        )  # pointwise/1x1 convs, implemented with linear layers
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(4 * dim, dim)
+        self.gamma = (
+            nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
+            if layer_scale_init_value > 0
+            else None
+        )
+        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+    def forward(self, x):
+        input = x
+        x = self.dwconv(x)
+        x = self.norm(x)
+        x = x.permute(0, 2, 3, 1)  # (N, C, H, W) -> (N, H, W, C)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        if self.gamma is not None:
+            x = self.gamma * x
+        x = x.permute(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
+        x = input + self.drop_path(x)
+        return x
+class Fuser(nn.Module):
+    def __init__(self, layer, num_layers, dim=None, input_projection=False):
+        super().__init__()
+        self.proj = nn.Identity()
+        self.layers = get_clones(layer, num_layers)
+        if input_projection:
+            assert dim is not None
+            self.proj = nn.Conv2d(dim, dim, kernel_size=1)
+    def forward(self, x):
+        # normally x: (N, C, H, W)
+        x = self.proj(x)
+        for layer in self.layers:
+            x = layer(x)
+        return x
+class MemoryEncoder(nn.Module):
+    def __init__(
+        self,
+        out_dim,
+        mask_downsampler,
+        fuser,
+        position_encoding,
+        in_dim=256,  # in_dim of pix_feats
+    ):
+        super().__init__()
+        self.mask_downsampler = mask_downsampler
+        self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1)
+        self.fuser = fuser
+        self.position_encoding = position_encoding
+        self.out_proj = nn.Identity()
+        if out_dim != in_dim:
+            self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1)
+    def forward(
+        self,
+        pix_feat: torch.Tensor,
+        masks: torch.Tensor,
+        skip_mask_sigmoid: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        ## Process masks
+        # sigmoid, so that less domain shift from gt masks which are bool
+        if not skip_mask_sigmoid:
+            masks = F.sigmoid(masks)
+        masks = self.mask_downsampler(masks)
+        ## Fuse pix_feats and downsampled masks
+        # in case the visual features are on CPU, cast them to CUDA
+        pix_feat = pix_feat.to(masks.device)
+        x = self.pix_feat_proj(pix_feat)
+        x = x + masks
+        x = self.fuser(x)
+        x = self.out_proj(x)
+        pos = self.position_encoding(x).to(x.dtype)
+        return {"vision_features": x, "vision_pos_enc": [pos]}

avs.code/v1m.code/model/visual/sam2/modeling/position_encoding.py ADDED Viewed

	@@ -0,0 +1,221 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import math
+from typing import Any, Optional, Tuple
+import numpy as np
+import torch
+from torch import nn
+class PositionEmbeddingSine(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one
+    used by the Attention Is All You Need paper, generalized to work on images.
+    """
+    def __init__(
+        self,
+        num_pos_feats,
+        temperature: int = 10000,
+        normalize: bool = True,
+        scale: Optional[float] = None,
+    ):
+        super().__init__()
+        assert num_pos_feats % 2 == 0, "Expecting even model width"
+        self.num_pos_feats = num_pos_feats // 2
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+        self.cache = {}
+    def _encode_xy(self, x, y):
+        # The positions are expected to be normalized
+        assert len(x) == len(y) and x.ndim == y.ndim == 1
+        x_embed = x * self.scale
+        y_embed = y * self.scale
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+        pos_x = x_embed[:, None] / dim_t
+        pos_y = y_embed[:, None] / dim_t
+        pos_x = torch.stack(
+            (pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2
+        ).flatten(1)
+        pos_y = torch.stack(
+            (pos_y[:, 0::2].sin(), pos_y[:, 1::2].cos()), dim=2
+        ).flatten(1)
+        return pos_x, pos_y
+    @torch.no_grad()
+    def encode_boxes(self, x, y, w, h):
+        pos_x, pos_y = self._encode_xy(x, y)
+        pos = torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)
+        return pos
+    encode = encode_boxes  # Backwards compatibility
+    @torch.no_grad()
+    def encode_points(self, x, y, labels):
+        (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
+        assert bx == by and nx == ny and bx == bl and nx == nl
+        pos_x, pos_y = self._encode_xy(x.flatten(), y.flatten())
+        pos_x, pos_y = pos_x.reshape(bx, nx, -1), pos_y.reshape(by, ny, -1)
+        pos = torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2)
+        return pos
+    @torch.no_grad()
+    def forward(self, x: torch.Tensor):
+        cache_key = (x.shape[-2], x.shape[-1])
+        if cache_key in self.cache:
+            return self.cache[cache_key][None].repeat(x.shape[0], 1, 1, 1)
+        y_embed = (
+            torch.arange(1, x.shape[-2] + 1, dtype=torch.float32, device=x.device)
+            .view(1, -1, 1)
+            .repeat(x.shape[0], 1, x.shape[-1])
+        )
+        x_embed = (
+            torch.arange(1, x.shape[-1] + 1, dtype=torch.float32, device=x.device)
+            .view(1, 1, -1)
+            .repeat(x.shape[0], x.shape[-2], 1)
+        )
+        if self.normalize:
+            eps = 1e-6
+            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack(
+            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
+        ).flatten(3)
+        pos_y = torch.stack(
+            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
+        ).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        self.cache[cache_key] = pos[0]
+        return pos
+class PositionEmbeddingRandom(nn.Module):
+    """
+    Positional encoding using random spatial frequencies.
+    """
+    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+        super().__init__()
+        if scale is None or scale <= 0.0:
+            scale = 1.0
+        self.register_buffer(
+            "positional_encoding_gaussian_matrix",
+            scale * torch.randn((2, num_pos_feats)),
+        )
+    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
+        """Positionally encode points that are normalized to [0,1]."""
+        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
+        coords = 2 * coords - 1
+        coords = coords @ self.positional_encoding_gaussian_matrix
+        coords = 2 * np.pi * coords
+        # outputs d_1 x ... x d_n x C shape
+        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
+    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
+        """Generate positional encoding for a grid of the specified size."""
+        h, w = size
+        device: Any = self.positional_encoding_gaussian_matrix.device
+        grid = torch.ones((h, w), device=device, dtype=torch.float32)
+        y_embed = grid.cumsum(dim=0) - 0.5
+        x_embed = grid.cumsum(dim=1) - 0.5
+        y_embed = y_embed / h
+        x_embed = x_embed / w
+        pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
+        return pe.permute(2, 0, 1)  # C x H x W
+    def forward_with_coords(
+        self, coords_input: torch.Tensor, image_size: Tuple[int, int]
+    ) -> torch.Tensor:
+        """Positionally encode points that are not normalized to [0,1]."""
+        coords = coords_input.clone()
+        coords[:, :, 0] = coords[:, :, 0] / image_size[1]
+        coords[:, :, 1] = coords[:, :, 1] / image_size[0]
+        return self._pe_encoding(coords.to(torch.float))  # B x N x C
+# Rotary Positional Encoding, adapted from:
+# 1. https://github.com/meta-llama/codellama/blob/main/llama/model.py
+# 2. https://github.com/naver-ai/rope-vit
+# 3. https://github.com/lucidrains/rotary-embedding-torch
+def init_t_xy(end_x: int, end_y: int):
+    t = torch.arange(end_x * end_y, dtype=torch.float32)
+    t_x = (t % end_x).float()
+    t_y = torch.div(t, end_x, rounding_mode="floor").float()
+    return t_x, t_y
+def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
+    freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim))
+    t_x, t_y = init_t_xy(end_x, end_y)
+    freqs_x = torch.outer(t_x, freqs_x)
+    freqs_y = torch.outer(t_y, freqs_y)
+    freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
+    freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)
+    return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)
+def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+    ndim = x.ndim
+    assert 0 <= 1 < ndim
+    assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
+    shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
+    return freqs_cis.view(*shape)
+def apply_rotary_enc(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+    repeat_freqs_k: bool = False,
+):
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = (
+        torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+        if xk.shape[-2] != 0
+        else None
+    )
+    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    if xk_ is None:
+        # no keys to rotate, due to dropout
+        return xq_out.type_as(xq).to(xq.device), xk
+    # repeat freqs along seq_len dim to match k seq_len
+    if repeat_freqs_k:
+        r = xk_.shape[-2] // xq_.shape[-2]
+        if freqs_cis.is_cuda:
+            freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1)
+        else:
+            # torch.repeat on complex numbers may not be supported on non-CUDA devices
+            # (freqs_cis has 4 dims and we repeat on dim 2) so we use expand + flatten
+            freqs_cis = freqs_cis.unsqueeze(2).expand(-1, -1, r, -1, -1).flatten(2, 3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)

avs.code/v1m.code/model/visual/sam2/modeling/sam/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.

avs.code/v1m.code/model/visual/sam2/modeling/sam/mask_decoder.py ADDED Viewed

	@@ -0,0 +1,300 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List, Optional, Tuple, Type
+import torch
+from torch import nn
+from model.visual.sam2.modeling.sam2_utils import LayerNorm2d, MLP
+class MaskDecoder(nn.Module):
+    """
+    Predicts segmentation masks, a quality (IoU) score per mask, the output mask tokens and
+    an object-score logit from image embeddings and prompt embeddings.
+    Learned output tokens (optional object-score token, IoU token, mask tokens) are
+    prepended to the sparse prompt embeddings and run through `transformer` together with
+    the image features; per-token hypernetwork MLPs then turn each mask token into weights
+    that are applied to the upscaled image features to produce one mask per token.
+    """
+    def __init__(
+        self,
+        *,
+        transformer_dim: int,
+        transformer: nn.Module,
+        num_multimask_outputs: int = 3,
+        activation: Type[nn.Module] = nn.GELU,
+        iou_head_depth: int = 3,
+        iou_head_hidden_dim: int = 256,
+        use_high_res_features: bool = False,
+        iou_prediction_use_sigmoid=False,
+        dynamic_multimask_via_stability=False,
+        dynamic_multimask_stability_delta=0.05,
+        dynamic_multimask_stability_thresh=0.98,
+        pred_obj_scores: bool = False,
+        pred_obj_scores_mlp: bool = False,
+        use_multimask_token_for_obj_ptr: bool = False,
+    ) -> None:
+        """
+        Predicts masks given an image and prompt embeddings, using a
+        transformer architecture.
+        Arguments:
+          transformer_dim (int): the channel dimension of the transformer
+          transformer (nn.Module): the transformer used to predict masks
+          num_multimask_outputs (int): the number of masks to predict
+            when disambiguating masks
+          activation (nn.Module): the type of activation to use when
+            upscaling masks
+          iou_head_depth (int): the depth of the MLP used to predict
+            mask quality
+          iou_head_hidden_dim (int): the hidden dimension of the MLP
+            used to predict mask quality
+          use_high_res_features (bool): if True, create the 1x1 convolutions
+            `conv_s0` / `conv_s1` and add `high_res_features` during upscaling
+          iou_prediction_use_sigmoid (bool): passed as `sigmoid_output` to the
+            IoU prediction MLP
+          dynamic_multimask_via_stability (bool): in eval mode with single-mask
+            output, fall back to the best multimask output when the single mask
+            is not stable enough
+          dynamic_multimask_stability_delta (float): logit threshold (+/- delta)
+            used to compute the stability score
+          dynamic_multimask_stability_thresh (float): minimum stability score for
+            the single-mask output to be kept
+          pred_obj_scores (bool): add an object-score token and prediction head
+          pred_obj_scores_mlp (bool): use a 3-layer MLP instead of a single linear
+            layer as the object-score head (only read when pred_obj_scores is True)
+          use_multimask_token_for_obj_ptr (bool): with multimask output, return the
+            multimask tokens instead of the single-mask token as `sam_tokens_out`
+        """
+        super().__init__()
+        self.transformer_dim = transformer_dim
+        self.transformer = transformer
+        self.num_multimask_outputs = num_multimask_outputs
+        self.iou_token = nn.Embedding(1, transformer_dim)
+        # One extra token (index 0) for the single-mask output
+        self.num_mask_tokens = num_multimask_outputs + 1
+        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
+        self.pred_obj_scores = pred_obj_scores
+        if self.pred_obj_scores:
+            self.obj_score_token = nn.Embedding(1, transformer_dim)
+        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
+        # Two stride-2 transposed convolutions: 4x spatial upscaling, channels reduced to dim // 8.
+        # NOTE: predict_masks unpacks this Sequential into exactly five modules.
+        self.output_upscaling = nn.Sequential(
+            nn.ConvTranspose2d(
+                transformer_dim, transformer_dim // 4, kernel_size=2, stride=2
+            ),
+            LayerNorm2d(transformer_dim // 4),
+            activation(),
+            nn.ConvTranspose2d(
+                transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2
+            ),
+            activation(),
+        )
+        self.use_high_res_features = use_high_res_features
+        if use_high_res_features:
+            self.conv_s0 = nn.Conv2d(
+                transformer_dim, transformer_dim // 8, kernel_size=1, stride=1
+            )
+            self.conv_s1 = nn.Conv2d(
+                transformer_dim, transformer_dim // 4, kernel_size=1, stride=1
+            )
+        # One hypernetwork MLP per mask token
+        self.output_hypernetworks_mlps = nn.ModuleList(
+            [
+                MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
+                for i in range(self.num_mask_tokens)
+            ]
+        )
+        self.iou_prediction_head = MLP(
+            transformer_dim,
+            iou_head_hidden_dim,
+            self.num_mask_tokens,
+            iou_head_depth,
+            sigmoid_output=iou_prediction_use_sigmoid,
+        )
+        if self.pred_obj_scores:
+            self.pred_obj_score_head = nn.Linear(transformer_dim, 1)
+            if pred_obj_scores_mlp:
+                self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3)
+        # When outputting a single mask, optionally we can dynamically fall back to the best
+        # multimask output token if the single mask output token gives low stability scores.
+        self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
+        self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
+        self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh
+    def forward(
+        self,
+        image_embeddings: torch.Tensor,
+        image_pe: torch.Tensor,
+        sparse_prompt_embeddings: torch.Tensor,
+        dense_prompt_embeddings: torch.Tensor,
+        multimask_output: bool,
+        repeat_image: bool,
+        high_res_features: Optional[List[torch.Tensor]] = None,
+        audio_res_features: Optional[List[torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Predict masks given image and prompt embeddings.
+        Arguments:
+          image_embeddings (torch.Tensor): the embeddings from the image encoder
+          image_pe (torch.Tensor): positional encoding with the shape of image_embeddings
+          sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
+          dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs
+          multimask_output (bool): Whether to return multiple masks or a single
+            mask.
+          repeat_image (bool): Whether to repeat the image embeddings along the batch
+            dim to match the number of prompts.
+          high_res_features (list of torch.Tensor, optional): the pair (feat_s0, feat_s1)
+            added during upscaling; required when use_high_res_features is True
+          audio_res_features (list of torch.Tensor, optional): forwarded unchanged to the
+            transformer as its fourth argument (NOTE(review): semantics are defined by
+            the transformer, which is not visible here)
+        Returns:
+          torch.Tensor: batched predicted masks
+          torch.Tensor: batched predictions of mask quality
+          torch.Tensor: batched SAM token for mask output
+          torch.Tensor: batched object score logits
+        """
+        masks, iou_pred, mask_tokens_out, object_score_logits = self.predict_masks(
+            image_embeddings=image_embeddings,
+            image_pe=image_pe,
+            sparse_prompt_embeddings=sparse_prompt_embeddings,
+            dense_prompt_embeddings=dense_prompt_embeddings,
+            repeat_image=repeat_image,
+            high_res_features=high_res_features,
+            audio_res_features_=audio_res_features
+        )
+        # Select the correct mask or masks for output
+        if multimask_output:
+            masks = masks[:, 1:, :, :]
+            iou_pred = iou_pred[:, 1:]
+        elif self.dynamic_multimask_via_stability and not self.training:
+            masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred)
+        else:
+            masks = masks[:, 0:1, :, :]
+            iou_pred = iou_pred[:, 0:1]
+        if multimask_output and self.use_multimask_token_for_obj_ptr:
+            sam_tokens_out = mask_tokens_out[:, 1:]  # [b, 3, c] shape
+        else:
+            # Take the mask output token. Here we *always* use the token for single mask output.
+            # At test time, even if we track after 1-click (and using multimask_output=True),
+            # we still take the single mask token here. The rationale is that we always track
+            # after multiple clicks during training, so the past tokens seen during training
+            # are always the single mask token (and we'll let it be the object-memory token).
+            sam_tokens_out = mask_tokens_out[:, 0:1]  # [b, 1, c] shape
+        # Prepare output
+        return masks, iou_pred, sam_tokens_out, object_score_logits
+    def predict_masks(
+        self,
+        image_embeddings: torch.Tensor,
+        image_pe: torch.Tensor,
+        sparse_prompt_embeddings: torch.Tensor,
+        dense_prompt_embeddings: torch.Tensor,
+        repeat_image: bool,
+        high_res_features: Optional[List[torch.Tensor]] = None,
+        audio_res_features_: Optional[List[torch.Tensor]] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Predicts masks. See 'forward' for more details.
+        Unlike 'forward', the returned masks, IoU predictions and mask tokens cover *all*
+        mask tokens (single-mask token at index 0 followed by the multimask tokens).
+        """
+        # Concatenate output tokens
+        # `s` is the index of the IoU token in the transformer output: 1 when the
+        # object-score token is prepended, 0 otherwise.
+        s = 0
+        if self.pred_obj_scores:
+            output_tokens = torch.cat(
+                [
+                    self.obj_score_token.weight,
+                    self.iou_token.weight,
+                    self.mask_tokens.weight,
+                ],
+                dim=0,
+            )
+            s = 1
+        else:
+            output_tokens = torch.cat(
+                [self.iou_token.weight, self.mask_tokens.weight], dim=0
+            )
+        output_tokens = output_tokens.unsqueeze(0).expand(
+            sparse_prompt_embeddings.size(0), -1, -1
+        )
+        tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
+        # Expand per-image data in batch direction to be per-mask
+        if repeat_image:
+            src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
+        else:
+            assert image_embeddings.shape[0] == tokens.shape[0]
+            src = image_embeddings
+        src = src + dense_prompt_embeddings
+        assert (
+            image_pe.size(0) == 1
+        ), "image_pe should have size 1 in batch dim (from `get_dense_pe()`)"
+        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
+        b, c, h, w = src.shape
+        # Run the transformer
+        hs, src = self.transformer(src, pos_src, tokens, audio_res_features_)
+        iou_token_out = hs[:, s, :]
+        mask_tokens_out = hs[:, s + 1 : (s + 1 + self.num_mask_tokens), :]
+        # Upscale mask embeddings and predict masks using the mask tokens
+        # (the transformer returns `src` flattened over space; restore (b, c, h, w))
+        src = src.transpose(1, 2).view(b, c, h, w)
+        if not self.use_high_res_features:
+            upscaled_embedding = self.output_upscaling(src)
+        else:
+            # Apply the upscaling modules one by one so the high-resolution features
+            # can be added after each transposed convolution.
+            dc1, ln1, act1, dc2, act2 = self.output_upscaling
+            feat_s0, feat_s1 = high_res_features
+            upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
+            upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)
+        hyper_in_list: List[torch.Tensor] = []
+        for i in range(self.num_mask_tokens):
+            hyper_in_list.append(
+                self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :])
+            )
+        hyper_in = torch.stack(hyper_in_list, dim=1)
+        b, c, h, w = upscaled_embedding.shape
+        # Each mask is the dot product of its hypernetwork output with every pixel's features
+        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
+        # Generate mask quality predictions
+        iou_pred = self.iou_prediction_head(iou_token_out)
+        if self.pred_obj_scores:
+            assert s == 1
+            object_score_logits = self.pred_obj_score_head(hs[:, 0, :])
+        else:
+            # Obj scores logits - default to 10.0, i.e. assuming the object is present, sigmoid(10)=1
+            object_score_logits = 10.0 * iou_pred.new_ones(iou_pred.shape[0], 1)
+        return masks, iou_pred, mask_tokens_out, object_score_logits
+    def _get_stability_scores(self, mask_logits):
+        """
+        Compute stability scores of the mask logits based on the IoU between upper and
+        lower thresholds.
+        The score is the ratio of the pixel count above +delta to the pixel count above
+        -delta; it is defined as 1.0 when no pixel exceeds -delta.
+        """
+        mask_logits = mask_logits.flatten(-2)
+        stability_delta = self.dynamic_multimask_stability_delta
+        area_i = torch.sum(mask_logits > stability_delta, dim=-1).float()
+        area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float()
+        stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0)
+        return stability_scores
+    def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
+        """
+        When outputting a single mask, if the stability score from the current single-mask
+        output (based on output token 0) falls below a threshold, we instead select from
+        multi-mask outputs (based on output token 1~3) the mask with the highest predicted
+        IoU score. This is intended to ensure a valid mask for both clicking and tracking.
+        Returns the selected mask logits and IoU scores, each with a singleton mask dim.
+        """
+        # The best mask from multimask output tokens (1~3)
+        multimask_logits = all_mask_logits[:, 1:, :, :]
+        multimask_iou_scores = all_iou_scores[:, 1:]
+        best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1)
+        batch_inds = torch.arange(
+            multimask_iou_scores.size(0), device=all_iou_scores.device
+        )
+        best_multimask_logits = multimask_logits[batch_inds, best_scores_inds]
+        best_multimask_logits = best_multimask_logits.unsqueeze(1)
+        best_multimask_iou_scores = multimask_iou_scores[batch_inds, best_scores_inds]
+        best_multimask_iou_scores = best_multimask_iou_scores.unsqueeze(1)
+        # The mask from singlemask output token 0 and its stability score
+        singlemask_logits = all_mask_logits[:, 0:1, :, :]
+        singlemask_iou_scores = all_iou_scores[:, 0:1]
+        stability_scores = self._get_stability_scores(singlemask_logits)
+        is_stable = stability_scores >= self.dynamic_multimask_stability_thresh
+        # Dynamically fall back to best multimask output upon low stability scores.
+        mask_logits_out = torch.where(
+            is_stable[..., None, None].expand_as(singlemask_logits),
+            singlemask_logits,
+            best_multimask_logits,
+        )
+        iou_scores_out = torch.where(
+            is_stable.expand_as(singlemask_iou_scores),
+            singlemask_iou_scores,
+            best_multimask_iou_scores,
+        )
+        return mask_logits_out, iou_scores_out

avs.code/v1m.code/model/visual/sam2/modeling/sam/prompt_encoder.py ADDED Viewed

	@@ -0,0 +1,188 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Optional, Tuple, Type
+import torch
+from torch import nn
+from model.visual.sam2.modeling.position_encoding import PositionEmbeddingRandom
+from model.visual.sam2.modeling.sam2_utils import LayerNorm2d
+class PromptEncoder(nn.Module):
+    def __init__(
+        self,
+        embed_dim: int,
+        image_embedding_size: Tuple[int, int],
+        input_image_size: Tuple[int, int],
+        mask_in_chans: int,
+        activation: Type[nn.Module] = nn.GELU,
+    ) -> None:
+        """
+        Encodes prompts for input to SAM's mask decoder.
+        Arguments:
+          embed_dim (int): The prompts' embedding dimension
+          image_embedding_size (tuple(int, int)): The spatial size of the
+            image embedding, as (H, W).
+          input_image_size (int): The padded size of the image as input
+            to the image encoder, as (H, W).
+          mask_in_chans (int): The number of hidden channels used for
+            encoding input masks.
+          activation (nn.Module): The activation to use when encoding
+            input masks.
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.input_image_size = input_image_size
+        self.image_embedding_size = image_embedding_size
+        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
+        self.num_point_embeddings: int = 4  # pos/neg point + 2 box corners
+        point_embeddings = [
+            nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)
+        ]
+        self.point_embeddings = nn.ModuleList(point_embeddings)
+        self.not_a_point_embed = nn.Embedding(1, embed_dim)
+        self.mask_input_size = (
+            4 * image_embedding_size[0],
+            4 * image_embedding_size[1],
+        )
+        self.mask_downscaling = nn.Sequential(
+            nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
+            LayerNorm2d(mask_in_chans // 4),
+            activation(),
+            nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),
+            LayerNorm2d(mask_in_chans),
+            activation(),
+            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
+        )
+        self.no_mask_embed = nn.Embedding(1, embed_dim)
+    def get_dense_pe(self) -> torch.Tensor:
+        """
+        Returns the positional encoding used to encode point prompts,
+        applied to a dense set of points the shape of the image encoding.
+        Returns:
+          torch.Tensor: Positional encoding with shape
+            1x(embed_dim)x(embedding_h)x(embedding_w)
+        """
+        return self.pe_layer(self.image_embedding_size).unsqueeze(0)
+    def _embed_points(
+        self,
+        points: torch.Tensor,
+        labels: torch.Tensor,
+        pad: bool,
+    ) -> torch.Tensor:
+        """Embeds point prompts."""
+        points = points + 0.5  # Shift to center of pixel
+        if pad:
+            padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
+            padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)
+            points = torch.cat([points, padding_point], dim=1)
+            labels = torch.cat([labels, padding_label], dim=1)
+        point_embedding = self.pe_layer.forward_with_coords(
+            points, self.input_image_size
+        )
+        point_embedding[labels == -1] = 0.0
+        point_embedding[labels == -1] += self.not_a_point_embed.weight
+        point_embedding[labels == 0] += self.point_embeddings[0].weight
+        point_embedding[labels == 1] += self.point_embeddings[1].weight
+        point_embedding[labels == 2] += self.point_embeddings[2].weight
+        point_embedding[labels == 3] += self.point_embeddings[3].weight
+        return point_embedding
+    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
+        """Embeds box prompts."""
+        boxes = boxes + 0.5  # Shift to center of pixel
+        coords = boxes.reshape(-1, 2, 2)
+        corner_embedding = self.pe_layer.forward_with_coords(
+            coords, self.input_image_size
+        )
+        corner_embedding[:, 0, :] += self.point_embeddings[2].weight
+        corner_embedding[:, 1, :] += self.point_embeddings[3].weight
+        return corner_embedding
+    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
+        """Embeds mask inputs."""
+        mask_embedding = self.mask_downscaling(masks)
+        return mask_embedding
+    def _get_batch_size(
+        self,
+        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        boxes: Optional[torch.Tensor],
+        masks: Optional[torch.Tensor],
+    ) -> int:
+        """
+        Gets the batch size of the output given the batch size of the input prompts.
+        """
+        if points is not None:
+            return points[0].shape[0]
+        elif boxes is not None:
+            return boxes.shape[0]
+        elif masks is not None:
+            return masks.shape[0]
+        else:
+            return 1
+    def _get_device(self) -> torch.device:
+        return self.point_embeddings[0].weight.device
+    def forward(
+        self,
+        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        boxes: Optional[torch.Tensor],
+        masks: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Embeds different types of prompts, returning both sparse and dense
+        embeddings.
+        Arguments:
+          points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
+            and labels to embed.
+          boxes (torch.Tensor or none): boxes to embed
+          masks (torch.Tensor or none): masks to embed
+        Returns:
+          torch.Tensor: sparse embeddings for the points and boxes, with shape
+            BxNx(embed_dim), where N is determined by the number of input points
+            and boxes.
+          torch.Tensor: dense embeddings for the masks, in the shape
+            Bx(embed_dim)x(embed_H)x(embed_W)
+        """
+        # we only utilise sounding as prompt.
+        bs = self._get_batch_size(points, boxes, masks)
+        sparse_embeddings = torch.empty(
+            (bs, 0, self.embed_dim), device=self._get_device()
+        )
+        dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
+            bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
+        )
+        '''
+        if points is not None:
+            coords, labels = points
+            point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))
+            sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)
+        if boxes is not None:
+            box_embeddings = self._embed_boxes(boxes)
+            sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)
+        if masks is not None:
+            dense_embeddings = self._embed_masks(masks)
+        else:
+            dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(
+                bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]
+            )
+        '''
+        return sparse_embeddings, dense_embeddings

avs.code/v1m.code/model/visual/sam2/modeling/sam/transformer.py ADDED Viewed

	@@ -0,0 +1,367 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import contextlib
+import math
+import warnings
+from functools import partial
+from typing import Tuple, Type
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+from model.visual.sam2.modeling.position_encoding import apply_rotary_enc, compute_axial_cis
+from model.visual.sam2.modeling.sam2_utils import MLP
+from model.visual.sam2.utils.misc import get_sdpa_settings
+# Suppress every FutureWarning; this filter is installed as soon as the module is imported.
+warnings.simplefilter(action="ignore", category=FutureWarning)
+# Check whether Flash Attention is available (and use it by default)
+OLD_GPU, USE_FLASH_ATTN, MATH_KERNEL_ON = get_sdpa_settings()
+# A fallback setting to allow all available kernels if Flash Attention fails.
+# The attention layers in this file flip it to True after the first kernel failure.
+ALLOW_ALL_KERNELS = False
+def sdp_kernel_context(dropout_p):
+    """
+    Get the context for the attention scaled dot-product kernel. We use Flash Attention
+    by default, but fall back to all available kernels if Flash Attention fails.
+    """
+    if ALLOW_ALL_KERNELS:
+        return contextlib.nullcontext()
+    return torch.backends.cuda.sdp_kernel(
+        enable_flash=USE_FLASH_ATTN,
+        # if Flash attention kernel is off, then math kernel needs to be enabled
+        enable_math=(OLD_GPU and dropout_p > 0.0) or MATH_KERNEL_ON,
+        enable_mem_efficient=OLD_GPU,
+    )
+class TwoWayTransformer(nn.Module):
+    """
+    Stack of `TwoWayAttentionBlock`s followed by a final token-to-image
+    attention, with extra residuals injected into the image keys and into
+    query tokens 2..5 before every block and once more after the last block.
+    """
+    def __init__(
+        self,
+        depth: int,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+    ) -> None:
+        """
+        A transformer decoder that attends to an input image using
+        queries whose positional embedding is supplied.
+        Args:
+          depth (int): number of layers in the transformer
+          embedding_dim (int): the channel dimension for the input embeddings
+          num_heads (int): the number of heads for multihead attention. Must
+            divide embedding_dim
+          mlp_dim (int): the channel dimension internal to the MLP block
+          activation (nn.Module): the activation to use in the MLP block
+          attention_downsample_rate (int): passed as `downsample_rate` to the
+            cross-attention layers of every block and to the final attention
+        """
+        super().__init__()
+        self.depth = depth
+        self.embedding_dim = embedding_dim
+        self.num_heads = num_heads
+        self.mlp_dim = mlp_dim
+        self.layers = nn.ModuleList()
+        for i in range(depth):
+            self.layers.append(
+                TwoWayAttentionBlock(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    mlp_dim=mlp_dim,
+                    activation=activation,
+                    attention_downsample_rate=attention_downsample_rate,
+                    # Only the first block skips adding the query PE in self-attention.
+                    skip_first_layer_pe=(i == 0),
+                )
+            )
+        self.final_attn_token_to_image = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.norm_final_attn = nn.LayerNorm(embedding_dim)
+    def forward(
+        self,
+        image_embedding: Tensor,
+        image_pe: Tensor,
+        point_embedding: Tensor,
+        audio_res: [],
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+          image_embedding (torch.Tensor): image to attend to. Should be shape
+            B x embedding_dim x h x w for any h and w.
+          image_pe (torch.Tensor): the positional encoding to add to the image. Must
+            have the same shape as image_embedding.
+          point_embedding (torch.Tensor): the embedding to add to the query points.
+            Must have shape B x N_points x embedding_dim for any N_points.
+          audio_res: a pair that is unpacked into (visual_res, audio_res). Both
+            members are indexed with the layer index and with -1; visual_res
+            entries are added to the image keys and audio_res entries to query
+            tokens 2..5. NOTE(review): the `[]` annotation is a list literal, not
+            a type; the concrete container and tensor shapes are not visible
+            here -- confirm against the caller.
+        Returns:
+          torch.Tensor: the processed point_embedding
+          torch.Tensor: the processed image_embedding
+        """
+        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
+        bs, c, h, w = image_embedding.shape
+        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
+        image_pe = image_pe.flatten(2).permute(0, 2, 1)
+        visual_res, audio_res = audio_res
+        # Prepare queries
+        # NOTE(review): `queries` is the same tensor object as `point_embedding`
+        # until the first layer returns a new tensor, so the slice assignment in
+        # the first loop iteration also modifies `point_embedding` (the caller's
+        # tensor, which is reused below as `query_pe`). Confirm this is intended.
+        queries = point_embedding
+        keys = image_embedding
+        # Apply transformer blocks and final layernorm
+        for i, layer in enumerate(self.layers):
+            # Inject the per-layer residuals before running the block.
+            keys = keys + visual_res[i]
+            queries[:, 2:6] = queries[:, 2:6] + audio_res[i]
+            queries, keys = layer(
+                queries=queries,
+                keys=keys,
+                query_pe=point_embedding,
+                key_pe=image_pe,
+            )
+        # One more injection after the last block, using the final entry of each list.
+        queries[:, 2:6] = queries[:, 2:6] + audio_res[-1]
+        keys = keys + visual_res[-1]
+        # Apply the final attention layer from the points to the image
+        q = queries + point_embedding
+        k = keys + image_pe
+        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
+        queries = queries + attn_out
+        queries = self.norm_final_attn(queries)
+        return queries, keys
+class TwoWayAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int = 2048,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+        skip_first_layer_pe: bool = False,
+    ) -> None:
+        """
+        A transformer block with four layers: (1) self-attention of sparse
+        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
+        block on sparse inputs, and (4) cross attention of dense inputs to sparse
+        inputs.
+        Arguments:
+          embedding_dim (int): the channel dimension of the embeddings
+          num_heads (int): the number of heads in the attention layers
+          mlp_dim (int): the hidden dimension of the mlp block
+          activation (nn.Module): the activation of the mlp block
+          skip_first_layer_pe (bool): skip the PE on the first layer
+        """
+        super().__init__()
+        self.self_attn = Attention(embedding_dim, num_heads)
+        self.norm1 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_token_to_image = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.norm2 = nn.LayerNorm(embedding_dim)
+        self.mlp = MLP(
+            embedding_dim, mlp_dim, embedding_dim, num_layers=2, activation=activation
+        )
+        self.norm3 = nn.LayerNorm(embedding_dim)
+        self.norm4 = nn.LayerNorm(embedding_dim)
+        self.cross_attn_image_to_token = Attention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate
+        )
+        self.skip_first_layer_pe = skip_first_layer_pe
+    def forward(
+        self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
+    ) -> Tuple[Tensor, Tensor]:
+        # Self attention block
+        if self.skip_first_layer_pe:
+            queries = self.self_attn(q=queries, k=queries, v=queries)
+        else:
+            q = queries + query_pe
+            attn_out = self.self_attn(q=q, k=q, v=queries)
+            queries = queries + attn_out
+        queries = self.norm1(queries)
+        # Cross attention block, tokens attending to image embedding
+        q = queries + query_pe
+        k = keys + key_pe
+        attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
+        queries = queries + attn_out
+        queries = self.norm2(queries)
+        # MLP block
+        mlp_out = self.mlp(queries)
+        queries = queries + mlp_out
+        queries = self.norm3(queries)
+        # Cross attention block, image embedding attending to tokens
+        q = queries + query_pe
+        k = keys + key_pe
+        attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
+        keys = keys + attn_out
+        keys = self.norm4(keys)
+        return queries, keys
+class Attention(nn.Module):
+    """
+    An attention layer that allows for downscaling the size of the embedding
+    after projection to queries, keys, and values.
+    """
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        downsample_rate: int = 1,
+        dropout: float = 0.0,
+        kv_in_dim: int = None,
+    ) -> None:
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim
+        self.internal_dim = embedding_dim // downsample_rate
+        self.num_heads = num_heads
+        assert (
+            self.internal_dim % num_heads == 0
+        ), "num_heads must divide embedding_dim."
+        self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
+        self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
+        self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim)
+        self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
+        self.dropout_p = dropout
+    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+        b, n, c = x.shape
+        x = x.reshape(b, n, num_heads, c // num_heads)
+        return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
+    def _recombine_heads(self, x: Tensor) -> Tensor:
+        b, n_heads, n_tokens, c_per_head = x.shape
+        x = x.transpose(1, 2)
+        return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        # Input projections
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        v = self.v_proj(v)
+        # Separate into heads
+        q = self._separate_heads(q, self.num_heads)
+        k = self._separate_heads(k, self.num_heads)
+        v = self._separate_heads(v, self.num_heads)
+        dropout_p = self.dropout_p if self.training else 0.0
+        # Attention
+        try:
+            with sdp_kernel_context(dropout_p):
+                out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+        except Exception as e:
+            # Fall back to all kernels if the Flash attention kernel fails
+            warnings.warn(
+                f"Flash Attention kernel failed due to: {e}\nFalling back to all available "
+                f"kernels for scaled_dot_product_attention (which may have a slower speed).",
+                category=UserWarning,
+                stacklevel=2,
+            )
+            global ALLOW_ALL_KERNELS
+            ALLOW_ALL_KERNELS = True
+            out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+        out = self._recombine_heads(out)
+        out = self.out_proj(out)
+        return out
+class RoPEAttention(Attention):
+    """
+    Attention with rotary position encoding.
+    The rotary frequencies are computed once in `__init__` for `feat_sizes`
+    and recomputed in `forward` whenever their first dimension does not match
+    the number of query tokens.
+    """
+    def __init__(
+        self,
+        *args,
+        rope_theta=10000.0,
+        # whether to repeat q rope to match k length
+        # this is needed for cross-attention to memories
+        rope_k_repeat=False,
+        feat_sizes=(32, 32),  # [w, h] for stride 16 feats at 512 resolution
+        **kwargs,
+    ):
+        """
+        Arguments:
+          *args, **kwargs: forwarded unchanged to `Attention.__init__`
+          rope_theta (float): `theta` passed to `compute_axial_cis`
+          rope_k_repeat (bool): passed to `apply_rotary_enc` as `repeat_freqs_k`;
+            must be True when q and k have different token counts
+          feat_sizes (tuple): (end_x, end_y) used for the initial frequencies
+        """
+        super().__init__(*args, **kwargs)
+        # Per-head channel size is fixed here; only the grid size varies later.
+        self.compute_cis = partial(
+            compute_axial_cis, dim=self.internal_dim // self.num_heads, theta=rope_theta
+        )
+        freqs_cis = self.compute_cis(end_x=feat_sizes[0], end_y=feat_sizes[1])
+        # Stored as a plain attribute (no register_buffer call here), which is
+        # why forward() moves it to the query's device explicitly.
+        self.freqs_cis = freqs_cis
+        self.rope_k_repeat = rope_k_repeat
+    def forward(
+        self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0
+    ) -> Tensor:
+        """
+        Same as `Attention.forward`, with rotary encoding applied to q and to
+        all but the last `num_k_exclude_rope` key tokens before attention.
+        """
+        # Input projections
+        q = self.q_proj(q)
+        k = self.k_proj(k)
+        v = self.v_proj(v)
+        # Separate into heads
+        q = self._separate_heads(q, self.num_heads)
+        k = self._separate_heads(k, self.num_heads)
+        v = self._separate_heads(v, self.num_heads)
+        # Apply rotary position encoding
+        # NOTE(review): this treats the query tokens as a square grid, and
+        # math.sqrt returns a float that is handed to compute_cis as-is --
+        # confirm compute_axial_cis accepts float extents.
+        w = h = math.sqrt(q.shape[-2])
+        self.freqs_cis = self.freqs_cis.to(q.device)
+        if self.freqs_cis.shape[0] != q.shape[-2]:
+            # Cached frequencies do not match this token count: rebuild and cache them.
+            self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q.device)
+        if q.shape[-2] != k.shape[-2]:
+            assert self.rope_k_repeat
+        # Keys beyond this index are left without rotary encoding.
+        num_k_rope = k.size(-2) - num_k_exclude_rope
+        # The rotated keys are written back into the leading slice of k in place.
+        q, k[:, :, :num_k_rope] = apply_rotary_enc(
+            q,
+            k[:, :, :num_k_rope],
+            freqs_cis=self.freqs_cis,
+            repeat_freqs_k=self.rope_k_repeat,
+        )
+        dropout_p = self.dropout_p if self.training else 0.0
+        # Attention
+        try:
+            with sdp_kernel_context(dropout_p):
+                out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+        except Exception as e:
+            # Fall back to all kernels if the Flash attention kernel fails
+            warnings.warn(
+                f"Flash Attention kernel failed due to: {e}\nFalling back to all available "
+                f"kernels for scaled_dot_product_attention (which may have a slower speed).",
+                category=UserWarning,
+                stacklevel=2,
+            )
+            # Module-wide switch: later calls run without the restricted kernel context.
+            global ALLOW_ALL_KERNELS
+            ALLOW_ALL_KERNELS = True
+            out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+        out = self._recombine_heads(out)
+        out = self.out_proj(out)
+        return out

avs.code/v1m.code/model/visual/sam2/modeling/sam2_base.py ADDED Viewed

	@@ -0,0 +1,940 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+import torch.distributed
+import torch.nn.functional as F
+from torch.nn.init import trunc_normal_
+from model.visual.sam2.modeling.sam.mask_decoder import MaskDecoder
+from model.visual.sam2.modeling.sam.prompt_encoder import PromptEncoder
+from model.visual.sam2.modeling.sam.transformer import TwoWayTransformer
+from model.visual.sam2.modeling.sam2_utils import get_1d_sine_pe, MLP, select_closest_cond_frames
+# Placeholder score for missing objects: a large negative value.
+NO_OBJ_SCORE = -1024.0
+class SAM2Base(torch.nn.Module):
+    def __init__(
+        self,
+        image_encoder,
+        memory_attention,
+        memory_encoder,
+        num_maskmem=7,  # default 1 input frame + 6 previous frames
+        image_size=512,
+        backbone_stride=16,  # stride of the image backbone output
+        sigmoid_scale_for_mem_enc=1.0,  # scale factor for mask sigmoid prob
+        sigmoid_bias_for_mem_enc=0.0,  # bias factor for mask sigmoid prob
+        # During evaluation, whether to binarize the sigmoid mask logits on interacted frames with clicks
+        binarize_mask_from_pts_for_mem_enc=False,
+        use_mask_input_as_output_without_sam=False,  # on frames with mask input, whether to directly output the input mask without using a SAM prompt encoder + mask decoder
+        # The maximum number of conditioning frames to participate in the memory attention (-1 means no limit; if there are more conditioning frames than this limit,
+        # we only cross-attend to the temporally closest `max_cond_frames_in_attn` conditioning frames in the encoder when tracking each frame). This gives the model
+        # a temporal locality when handling a large number of annotated frames (since closer frames should be more important) and also avoids GPU OOM.
+        max_cond_frames_in_attn=-1,
+        # on the first frame, whether to directly add the no-memory embedding to the image feature
+        # (instead of using the transformer encoder)
+        directly_add_no_mem_embed=False,
+        # whether to use high-resolution feature maps in the SAM mask decoder
+        use_high_res_features_in_sam=False,
+        # whether to output multiple (3) masks for the first click on initial conditioning frames
+        multimask_output_in_sam=False,
+        # the minimum and maximum number of clicks to use multimask_output_in_sam (only relevant when `multimask_output_in_sam=True`;
+        # default is 1 for both, meaning that only the first click gives multimask output; also note that a box counts as two points)
+        multimask_min_pt_num=1,
+        multimask_max_pt_num=1,
+        # whether to also use multimask output for tracking (not just for the first click on initial conditioning frames; only relevant when `multimask_output_in_sam=True`)
+        multimask_output_for_tracking=False,
+        # Whether to use multimask tokens for obj ptr; Only relevant when both
+        # use_obj_ptrs_in_encoder=True and multimask_output_for_tracking=True
+        use_multimask_token_for_obj_ptr: bool = False,
+        # whether to use sigmoid to restrict ious prediction to [0-1]
+        iou_prediction_use_sigmoid=False,
+        # The memory bank's temporal stride during evaluation (i.e. the `r` parameter in XMem and Cutie; XMem and Cutie use r=5).
+        # For r>1, the (self.num_maskmem - 1) non-conditioning memory frames consist of
+        # (self.num_maskmem - 2) nearest frames from every r-th frames, plus the last frame.
+        memory_temporal_stride_for_eval=1,
+        # whether to apply non-overlapping constraints on the object masks in the memory encoder during evaluation (to avoid/alleviate superposing masks)
+        non_overlap_masks_for_mem_enc=False,
+        # whether to cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
+        use_obj_ptrs_in_encoder=False,
+        # the maximum number of object pointers from other frames in encoder cross attention (only relevant when `use_obj_ptrs_in_encoder=True`)
+        max_obj_ptrs_in_encoder=16,
+        # whether to add temporal positional encoding to the object pointers in the encoder (only relevant when `use_obj_ptrs_in_encoder=True`)
+        add_tpos_enc_to_obj_ptrs=True,
+        # whether to add an extra linear projection layer for the temporal positional encoding in the object pointers to avoid potential interference
+        # with spatial positional encoding (only relevant when both `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`)
+        proj_tpos_enc_in_obj_ptrs=False,
+        # whether to use signed distance (instead of unsigned absolute distance) in the temporal positional encoding in the object pointers
+        # (only relevant when both `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`)
+        use_signed_tpos_enc_to_obj_ptrs=False,
+        # whether to only attend to object pointers in the past (before the current frame) in the encoder during evaluation
+        # (only relevant when `use_obj_ptrs_in_encoder=True`; this might avoid pointer information too far in the future to distract the initial tracking)
+        only_obj_ptrs_in_the_past_for_eval=False,
+        # Whether to predict if there is an object in the frame
+        pred_obj_scores: bool = False,
+        # Whether to use an MLP to predict object scores
+        pred_obj_scores_mlp: bool = False,
+        # Only relevant if pred_obj_scores=True and use_obj_ptrs_in_encoder=True;
+        # Whether to have a fixed no obj pointer when there is no object present
+        # or to use it as an additive embedding with obj_ptr produced by decoder
+        fixed_no_obj_ptr: bool = False,
+        # Soft no object, i.e. mix in no_obj_ptr softly,
+        # hope to make recovery easier if there is a mistake and mitigate accumulation of errors
+        soft_no_obj_ptr: bool = False,
+        use_mlp_for_obj_ptr_proj: bool = False,
+        # add no obj embedding to spatial frames
+        no_obj_embed_spatial: bool = False,
+        # extra arguments used to construct the SAM mask decoder; if not None, it should be a dict of kwargs to be passed into `MaskDecoder` class.
+        sam_mask_decoder_extra_args=None,
+        compile_image_encoder: bool = False,
+    ):
+        """
+        Store the sub-modules and configuration flags, create the learned
+        memory / no-object parameters, and build the SAM heads through
+        `_build_sam_heads()`. Each argument is described by the comment above
+        it in the signature.
+        Two settings are fixed here regardless of the arguments:
+          * `hidden_dim` is hard-coded to 256 (it is not read from
+            `memory_attention` or `image_encoder`).
+          * `use_mask_input_as_output_without_sam` is forced to False at the end
+            of this method, overriding the value passed in.
+        """
+        super().__init__()
+        # Part 1: the image backbone
+        self.image_encoder = image_encoder
+        # Use level 0, 1, 2 for high-res setting, or just level 2 for the default setting
+        self.use_high_res_features_in_sam = use_high_res_features_in_sam
+        self.num_feature_levels = 3 if use_high_res_features_in_sam else 1
+        self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder
+        self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder
+        if use_obj_ptrs_in_encoder:
+            # A conv layer to downsample the mask prompt to stride 4 (the same stride as
+            # low-res SAM mask logits) and to change its scales from 0~1 to SAM logit scale,
+            # so that it can be fed into the SAM mask decoder to generate a pointer.
+            self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4)
+        self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs
+        if proj_tpos_enc_in_obj_ptrs:
+            assert add_tpos_enc_to_obj_ptrs  # these options need to be used together
+        self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs
+        self.use_signed_tpos_enc_to_obj_ptrs = use_signed_tpos_enc_to_obj_ptrs
+        self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval
+        # Part 2: memory attention to condition current frame's visual features
+        # with memories (and obj ptrs) from past frames
+        self.memory_attention = memory_attention
+        #### this is for Version 2.0
+        # self.hidden_dim = memory_attention.d_model
+        #### this is for Version 2.1
+        # self.hidden_dim = image_encoder.neck.d_model
+        self.hidden_dim = 256 # well, it is always 256 anyway.
+        # Part 3: memory encoder for the previous frame's outputs
+        self.memory_encoder = memory_encoder
+        self.mem_dim = self.hidden_dim
+        if hasattr(self.memory_encoder, "out_proj") and hasattr(
+            self.memory_encoder.out_proj, "weight"
+        ):
+            # if there is compression of memories along channel dim
+            self.mem_dim = self.memory_encoder.out_proj.weight.shape[0]
+        self.num_maskmem = num_maskmem  # Number of memories accessible
+        # Temporal encoding of the memories
+        self.maskmem_tpos_enc = torch.nn.Parameter(
+            torch.zeros(num_maskmem, 1, 1, self.mem_dim)
+        )
+        trunc_normal_(self.maskmem_tpos_enc, std=0.02)
+        # a single token to indicate no memory embedding from previous frames
+        self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
+        self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
+        trunc_normal_(self.no_mem_embed, std=0.02)
+        trunc_normal_(self.no_mem_pos_enc, std=0.02)
+        self.directly_add_no_mem_embed = directly_add_no_mem_embed
+        # Apply sigmoid to the output raw mask logits (to turn them from
+        # range (-inf, +inf) to range (0, 1)) before feeding them into the memory encoder
+        self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc
+        self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc
+        self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc
+        self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc
+        self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval
+        # On frames with mask input, whether to directly output the input mask without
+        # using a SAM prompt encoder + mask decoder
+        # (this value is overwritten with False at the end of __init__)
+        self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam
+        self.multimask_output_in_sam = multimask_output_in_sam
+        self.multimask_min_pt_num = multimask_min_pt_num
+        self.multimask_max_pt_num = multimask_max_pt_num
+        self.multimask_output_for_tracking = multimask_output_for_tracking
+        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
+        self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid
+        # Part 4: SAM-style prompt encoder (for both mask and point inputs)
+        # and SAM-style mask decoder for the final mask output
+        self.image_size = image_size
+        self.backbone_stride = backbone_stride
+        self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args
+        self.pred_obj_scores = pred_obj_scores
+        self.pred_obj_scores_mlp = pred_obj_scores_mlp
+        self.fixed_no_obj_ptr = fixed_no_obj_ptr
+        self.soft_no_obj_ptr = soft_no_obj_ptr
+        if self.fixed_no_obj_ptr:
+            # a fixed no-object pointer requires both of these options
+            assert self.pred_obj_scores
+            assert self.use_obj_ptrs_in_encoder
+        if self.pred_obj_scores and self.use_obj_ptrs_in_encoder:
+            self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim))
+            trunc_normal_(self.no_obj_ptr, std=0.02)
+        self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj
+        # The attribute keeps the argument's name but holds either None or a
+        # learned (1, mem_dim) parameter, not the boolean flag itself.
+        self.no_obj_embed_spatial = None
+        if no_obj_embed_spatial:
+            self.no_obj_embed_spatial = torch.nn.Parameter(torch.zeros(1, self.mem_dim))
+            trunc_normal_(self.no_obj_embed_spatial, std=0.02)
+        self._build_sam_heads()
+        self.max_cond_frames_in_attn = max_cond_frames_in_attn
+        # Model compilation
+        if compile_image_encoder:
+            # Compile the forward function (not the full module) to allow loading checkpoints.
+            print(
+                "Image encoder compilation is enabled. First forward pass will be slow."
+            )
+            self.image_encoder.forward = torch.compile(
+                self.image_encoder.forward,
+                mode="max-autotune",
+                fullgraph=True,
+                dynamic=False,
+            )
+        ### we fix the use_mask_input_as_output_without_sam to be turned off.
+        self.use_mask_input_as_output_without_sam = False
+    @property
+    def device(self):
+        return next(self.parameters()).device
+    def forward(self, *args, **kwargs):
+        raise NotImplementedError(
+            "Please use the corresponding methods in SAM2VideoPredictor for inference or SAM2Train for training/fine-tuning"
+            "See notebooks/video_predictor_example.ipynb for an inference example."
+        )
+    def _build_sam_heads(self):
+        """
+        Build SAM-style prompt encoder and mask decoder.
+        Attributes created on `self`:
+        - sam_prompt_embed_dim: set to `self.hidden_dim`.
+        - sam_image_embedding_size: `self.image_size // self.backbone_stride`.
+        - sam_prompt_encoder: a `PromptEncoder` instance.
+        - sam_mask_decoder: a `MaskDecoder` instance wrapping a `TwoWayTransformer`.
+        - obj_ptr_proj: `Linear`, `MLP` or `Identity`, depending on
+          `use_obj_ptrs_in_encoder` and `use_mlp_for_obj_ptr_proj`.
+        - obj_ptr_tpos_proj: `Linear(hidden_dim, mem_dim)` or `Identity`, depending on
+          `proj_tpos_enc_in_obj_ptrs`.
+        """
+        self.sam_prompt_embed_dim = self.hidden_dim
+        # side length of the (square) feature grid handed to the prompt encoder below
+        self.sam_image_embedding_size = self.image_size // self.backbone_stride
+        # build PromptEncoder and MaskDecoder from SAM
+        # (their hyperparameters like `mask_in_chans=16` are from SAM code)
+        self.sam_prompt_encoder = PromptEncoder(
+            embed_dim=self.sam_prompt_embed_dim,
+            image_embedding_size=(
+                self.sam_image_embedding_size,
+                self.sam_image_embedding_size,
+            ),
+            input_image_size=(self.image_size, self.image_size),
+            mask_in_chans=16,
+        )
+        # Entries of `sam_mask_decoder_extra_args` (if any) are forwarded as additional
+        # keyword arguments; None is treated as an empty dict.
+        self.sam_mask_decoder = MaskDecoder(
+            num_multimask_outputs=3,
+            transformer=TwoWayTransformer(
+                depth=2,
+                embedding_dim=self.sam_prompt_embed_dim,
+                mlp_dim=2048,
+                num_heads=8,
+            ),
+            transformer_dim=self.sam_prompt_embed_dim,
+            iou_head_depth=3,
+            iou_head_hidden_dim=256,
+            use_high_res_features=self.use_high_res_features_in_sam,
+            iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
+            pred_obj_scores=self.pred_obj_scores,
+            pred_obj_scores_mlp=self.pred_obj_scores_mlp,
+            use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
+            **(self.sam_mask_decoder_extra_args or {}),
+        )
+        if self.use_obj_ptrs_in_encoder:
+            # a linear projection on SAM output tokens to turn them into object pointers
+            # (the Linear is always constructed first and then replaced by an MLP when
+            # `use_mlp_for_obj_ptr_proj` is set)
+            self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
+            if self.use_mlp_for_obj_ptr_proj:
+                self.obj_ptr_proj = MLP(
+                    self.hidden_dim, self.hidden_dim, self.hidden_dim, 3
+                )
+        else:
+            # no object pointers in the encoder: pass the SAM output token through unchanged
+            self.obj_ptr_proj = torch.nn.Identity()
+        if self.proj_tpos_enc_in_obj_ptrs:
+            # a linear projection on temporal positional encoding in object pointers to
+            # avoid potential interference with spatial positional encoding
+            self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim)
+        else:
+            self.obj_ptr_tpos_proj = torch.nn.Identity()
+    def _forward_sam_heads(
+        self,
+        backbone_features,
+        point_inputs=None,
+        mask_inputs=None,
+        high_res_features=None,
+        multimask_output=False,
+        audio_res=None
+    ):
+        """
+        Forward SAM prompt encoders and mask heads.
+        NOTE: in this version the point/mask prompt handling is disabled (it is kept only
+        inside a bare string literal below). The prompt encoder is always called with
+        `points=None, boxes=None, masks=None`, so `point_inputs` and `mask_inputs` are
+        accepted but never read by the active code.
+        Inputs:
+        - backbone_features: image features of [B, C, H, W] shape; C must equal
+          `self.sam_prompt_embed_dim` and H, W must equal `self.sam_image_embedding_size`
+          (checked by the asserts below).
+        - point_inputs: unused here. Originally a dictionary with "point_coords" and
+          "point_labels", where
+          1) "point_coords" has [B, P, 2] shape and float32 dtype and contains the
+             absolute pixel-unit coordinate in (x, y) format of the P input points
+          2) "point_labels" has shape [B, P] and int32 dtype, where 1 means
+             positive clicks, 0 means negative clicks, and -1 means padding
+        - mask_inputs: unused here. Originally a mask of [B, 1, H*16, W*16] shape, float
+          or bool, with the same spatial size as the image.
+        - high_res_features: either 1) None or 2) or a list of length 2 containing
+          two feature maps of [B, C, 4*H, 4*W] and [B, C, 2*H, 2*W] shapes respectively,
+          which will be used as high-resolution feature maps for SAM decoder.
+        - multimask_output: if it's True, we output 3 candidate masks and their 3
+          corresponding IoU estimates, and if it's False, we output only 1 mask and
+          its corresponding IoU estimate.
+        - audio_res: passed through unchanged to the mask decoder as its
+          `audio_res_features` argument (defaults to None).
+        Outputs (a 7-tuple, in this order):
+        - low_res_multimasks: [B, M, H*4, W*4] shape (where M = 3 if
+          `multimask_output=True` and M = 1 if `multimask_output=False`), the SAM
+          output mask logits (before sigmoid) for the low-resolution masks, with 4x
+          the resolution (1/4 stride) of the input backbone_features.
+        - high_res_multimasks: [B, M, H*16, W*16] shape (where M = 3
+          if `multimask_output=True` and M = 1 if `multimask_output=False`),
+          upsampled from the low-resolution masks, with shape size as the image
+          (stride is 1 pixel).
+        - ious, [B, M] shape, where (where M = 3 if `multimask_output=True` and M = 1
+          if `multimask_output=False`), the estimated IoU of each output mask.
+        - low_res_masks: [B, 1, H*4, W*4] shape, the best mask in `low_res_multimasks`.
+          If `multimask_output=True`, it's the mask with the highest IoU estimate.
+          If `multimask_output=False`, it's the same as `low_res_multimasks`.
+        - high_res_masks: [B, 1, H*16, W*16] shape, the best mask in `high_res_multimasks`.
+          If `multimask_output=True`, it's the mask with the highest IoU estimate.
+          If `multimask_output=False`, it's the same as `high_res_multimasks`.
+        - obj_ptr: [B, C] shape, the object pointer vector for the output mask, extracted
+          based on the output token from the SAM mask decoder.
+        - object_score_logits: the object score logits exactly as returned by the SAM
+          mask decoder.
+        """
+        B = backbone_features.size(0)
+        device = backbone_features.device
+        assert backbone_features.size(1) == self.sam_prompt_embed_dim
+        assert backbone_features.size(2) == self.sam_image_embedding_size
+        assert backbone_features.size(3) == self.sam_image_embedding_size
+        # The string literal below holds the original point/mask prompt handling. It is a
+        # no-op expression statement, i.e. that code is disabled.
+        '''
+        # a) Handle point prompts
+        if point_inputs is not None:
+            sam_point_coords = point_inputs["point_coords"]
+            sam_point_labels = point_inputs["point_labels"]
+            assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
+            raise NotImplementedError
+        else:
+            # If no points are provide, pad with an empty point (with label -1)
+            sam_point_coords = torch.zeros(B, 1, 2, device=device)
+            sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)
+        # b) Handle mask prompts
+        if mask_inputs is not None:
+            # If mask_inputs is provided, downsize it into low-res mask input if needed
+            # and feed it as a dense mask prompt into the SAM mask encoder
+            assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
+            if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
+                sam_mask_prompt = F.interpolate(
+                    mask_inputs.float(),
+                    size=self.sam_prompt_encoder.mask_input_size,
+                    align_corners=False,
+                    mode="bilinear",
+                    antialias=True,  # use antialias for downsampling
+                )
+            else:
+                sam_mask_prompt = mask_inputs
+            raise NotImplementedError
+        else:
+            # Otherwise, simply feed None (and SAM's prompt encoder will add
+            # a learned `no_mask_embed` to indicate no mask input in this case).
+            sam_mask_prompt = None
+        sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
+            points=(sam_point_coords, sam_point_labels),
+            boxes=None,
+            masks=sam_mask_prompt,
+        )
+        '''
+        # Active path: the prompt encoder is queried with no prompts at all.
+        sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
+            points=None,
+            boxes=None,
+            masks=None,
+        )
+        (
+            low_res_multimasks,
+            ious,
+            sam_output_tokens,
+            object_score_logits,
+        ) = self.sam_mask_decoder(
+            image_embeddings=backbone_features,
+            image_pe=self.sam_prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            dense_prompt_embeddings=dense_embeddings,
+            multimask_output=multimask_output,
+            repeat_image=False,  # the image is already batched
+            high_res_features=high_res_features,
+            audio_res_features=audio_res
+        )
+        # The string literal below holds the (disabled) hard obj/no-obj masking, so
+        # low_res_multimasks is NOT overwritten with NO_OBJ_SCORE here.
+        '''
+        if self.pred_obj_scores:
+            is_obj_appearing = object_score_logits > 0
+            # Mask used for spatial memories is always a *hard* choice between obj and no obj,
+            # consistent with the actual mask prediction
+            low_res_multimasks = torch.where(
+                is_obj_appearing[:, None, None],
+                low_res_multimasks,
+                NO_OBJ_SCORE,
+            )
+        '''
+        # convert masks from possibly bfloat16 (or float16) to float32
+        # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
+        low_res_multimasks = low_res_multimasks.float()
+        high_res_multimasks = F.interpolate(
+            low_res_multimasks,
+            size=(self.image_size, self.image_size),
+            mode="bilinear",
+            align_corners=False,
+        )
+        # default: use the first output token; replaced below if multimask selection applies
+        sam_output_token = sam_output_tokens[:, 0]
+        if multimask_output:
+            # NOTE(review): an earlier comment here said "comment this line temporarily",
+            # but the selection code below is active.
+            # take the best mask prediction (with the highest IoU estimation)
+            best_iou_inds = torch.argmax(ious, dim=-1)
+            batch_inds = torch.arange(B, device=device)
+            low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+            high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+            if sam_output_tokens.size(1) > 1:
+                sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
+        else:
+            low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks
+        # Extract object pointer from the SAM output token (with occlusion handling)
+        obj_ptr = self.obj_ptr_proj(sam_output_token)
+        # NOTE(review): an earlier comment here said occlusion is not trained at the moment
+        # and this part was to be commented out temporarily, but the block below is active
+        # whenever `self.pred_obj_scores` is set.
+        if self.pred_obj_scores:
+            is_obj_appearing = object_score_logits > 0
+            # Allow *soft* no obj ptr, unlike for masks
+            if self.soft_no_obj_ptr:
+                lambda_is_obj_appearing = object_score_logits.sigmoid()
+            else:
+                lambda_is_obj_appearing = is_obj_appearing.float()
+            if self.fixed_no_obj_ptr:
+                obj_ptr = lambda_is_obj_appearing * obj_ptr
+            # blend in the learned no-object pointer in proportion to (1 - lambda)
+            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
+        return (
+            low_res_multimasks,
+            high_res_multimasks,
+            ious,
+            low_res_masks,
+            high_res_masks,
+            obj_ptr,
+            object_score_logits,
+        )
+    def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
+        """
+        Directly turn binary `mask_inputs` into a output mask logits without using SAM.
+        (same input and output shapes as in _forward_sam_heads above).
+        """
+        # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid).
+        out_scale, out_bias = 20.0, -10.0  # sigmoid(-10.0)=4.5398e-05
+        mask_inputs_float = mask_inputs.float()
+        high_res_masks = mask_inputs_float * out_scale + out_bias
+        low_res_masks = F.interpolate(
+            high_res_masks,
+            size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4),
+            align_corners=False,
+            mode="bilinear",
+            antialias=True,  # use antialias for downsampling
+        )
+        # a dummy IoU prediction of all 1's under mask input
+        ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
+        if not self.use_obj_ptrs_in_encoder:
+            # all zeros as a dummy object pointer (of shape [B, C])
+            obj_ptr = torch.zeros(
+                mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device
+            )
+        else:
+            # produce an object pointer using the SAM decoder from the mask input
+            _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
+                backbone_features=backbone_features,
+                mask_inputs=self.mask_downsample(mask_inputs_float),
+                high_res_features=high_res_features,
+            )
+        # In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem;
+        # Below, we follow the same design axiom to use mask_input to decide if obj appears or not instead of relying
+        # on the object_scores from the SAM decoder.
+        is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1)
+        is_obj_appearing = is_obj_appearing[..., None]
+        lambda_is_obj_appearing = is_obj_appearing.float()
+        object_score_logits = out_scale * lambda_is_obj_appearing + out_bias
+        if self.pred_obj_scores:
+            if self.fixed_no_obj_ptr:
+                obj_ptr = lambda_is_obj_appearing * obj_ptr
+            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr
+        return (
+            low_res_masks,
+            high_res_masks,
+            ious,
+            low_res_masks,
+            high_res_masks,
+            obj_ptr,
+            object_score_logits,
+        )
+    def precompute_high_res_features(self, backbone_out):
+        if self.use_high_res_features_in_sam:
+            # precompute projected level 0 and level 1 features in SAM decoder
+            # to avoid running it again on every SAM click
+            backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(
+                backbone_out["backbone_fpn"][0]
+            )
+            backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(
+                backbone_out["backbone_fpn"][1]
+            )
+        return backbone_out
+    def forward_image(self, img_batch: torch.Tensor, pre_compute=True):
+        """Get the image feature on the input batch."""
+        backbone_out = self.image_encoder(img_batch)
+        return backbone_out if not pre_compute else self.precompute_high_res_features(backbone_out)
+    def _prepare_backbone_features(self, backbone_out):
+        """Prepare and flatten visual features."""
+        backbone_out = backbone_out.copy()
+        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
+        assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
+        feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
+        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]
+        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
+        # flatten NxCxHxW to HWxNxC
+        vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
+        vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
+        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
+    def _prepare_memory_conditioned_features(
+        self,
+        frame_idx,
+        is_init_cond_frame,
+        current_vision_feats,
+        current_vision_pos_embeds,
+        feat_sizes,
+        output_dict,
+        num_frames,
+        track_in_reverse=False,  # tracking in reverse time order (for demo usage)
+    ):
+        """
+        Fuse the current frame's visual feature map with previous memory.
+        Inputs:
+        - frame_idx: index of the current frame.
+        - is_init_cond_frame: if True, no previous memory is used (see below).
+        - current_vision_feats / current_vision_pos_embeds: lists of per-level tensors;
+          the last entry is reshaped here from (HW)BC to BCHW.
+        - feat_sizes: list of (H, W) per level; only the last entry is read.
+        - output_dict: read for its "cond_frame_outputs" and "non_cond_frame_outputs"
+          dicts (frame index -> output); each used entry is read for "maskmem_features",
+          "maskmem_pos_enc" and, when object pointers are enabled, "obj_ptr".
+        - num_frames: used to cap the number of object pointers and to stop the pointer
+          search at the end of the video.
+        - track_in_reverse: walk "previous" frames forward in time instead of backward.
+        Behaviour:
+        - `self.num_maskmem == 0`: returns the reshaped current features, no fusion.
+        - initial conditioning frame: with `self.directly_add_no_mem_embed` set, adds
+          `self.no_mem_embed` and returns; otherwise raises NotImplementedError.
+        - otherwise: concatenates spatial memories (and optionally object pointers) and
+          runs `self.memory_attention`.
+        Output:
+        - a [B, C, H, W] tensor with C = `self.hidden_dim` and (H, W) = `feat_sizes[-1]`.
+        """
+        B = current_vision_feats[-1].size(1)  # batch size on this frame
+        C = self.hidden_dim
+        H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
+        device = current_vision_feats[-1].device
+        # The case of `self.num_maskmem == 0` below is primarily used for reproducing SAM on images.
+        # In this case, we skip the fusion with any memory.
+        if self.num_maskmem == 0:  # Disable memory and skip fusion
+            pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
+            return pix_feat
+        num_obj_ptr_tokens = 0
+        # flips the sign of signed temporal offsets when tracking backwards in time
+        tpos_sign_mul = -1 if track_in_reverse else 1
+        # Step 1: condition the visual features of the current frame on previous memories
+        if not is_init_cond_frame:
+            # Retrieve the memories encoded with the maskmem backbone
+            to_cat_memory, to_cat_memory_pos_embed = [], []
+            # Add conditioning frames's output first (all cond frames have t_pos=0 for
+            # when getting temporal positional embedding below)
+            assert len(output_dict["cond_frame_outputs"]) > 0
+            # Select a maximum number of temporally closest cond frames for cross attention
+            cond_outputs = output_dict["cond_frame_outputs"]
+            selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames(
+                frame_idx, cond_outputs, self.max_cond_frames_in_attn
+            )
+            t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()]
+            # (disabled alternative that indexed non-conditioning outputs directly by t_pos)
+            # for t_pos in range(1, min(self.num_maskmem, frame_idx)):
+            #     out = output_dict["non_cond_frame_outputs"].get(t_pos, None)
+            #     t_pos_and_prevs.append((t_pos, out))
+            # Add last (self.num_maskmem - 1) frames before current frame for non-conditioning memory
+            # the earliest one has t_pos=1 and the latest one has t_pos=self.num_maskmem-1
+            # We also allow taking the memory frame non-consecutively (with stride>1), in which case
+            # we take (self.num_maskmem - 2) frames among every stride-th frames plus the last frame.
+            stride = 1 if self.training else self.memory_temporal_stride_for_eval
+            for t_pos in range(1, self.num_maskmem):
+                t_rel = self.num_maskmem - t_pos  # how many frames before current frame
+                if t_rel == 1:
+                    # for t_rel == 1, we take the last frame (regardless of r)
+                    if not track_in_reverse:
+                        # the frame immediately before this frame (i.e. frame_idx - 1)
+                        prev_frame_idx = frame_idx - t_rel
+                    else:
+                        # the frame immediately after this frame (i.e. frame_idx + 1)
+                        prev_frame_idx = frame_idx + t_rel
+                else:
+                    # for t_rel >= 2, we take the memory frame from every r-th frames
+                    if not track_in_reverse:
+                        # first find the nearest frame among every r-th frames before this frame
+                        # for r=1, this would be (frame_idx - 2)
+                        prev_frame_idx = ((frame_idx - 2) // stride) * stride
+                        # then seek further among every r-th frames
+                        prev_frame_idx = prev_frame_idx - (t_rel - 2) * stride
+                    else:
+                        # first find the nearest frame among every r-th frames after this frame
+                        # for r=1, this would be (frame_idx + 2)
+                        # (negated floor division rounds up to the next multiple of stride)
+                        prev_frame_idx = -(-(frame_idx + 2) // stride) * stride
+                        # then seek further among every r-th frames
+                        prev_frame_idx = prev_frame_idx + (t_rel - 2) * stride
+                out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None)
+                if out is None:
+                    # If an unselected conditioning frame is among the last (self.num_maskmem - 1)
+                    # frames, we still attend to it as if it's a non-conditioning frame.
+                    out = unselected_cond_outputs.get(prev_frame_idx, None)
+                # `out` may still be None here; such entries are skipped in the loop below
+                t_pos_and_prevs.append((t_pos, out))
+            for t_pos, prev in t_pos_and_prevs:
+                if prev is None:
+                    continue  # skip padding frames
+                # "maskmem_features" might have been offloaded to CPU in demo use cases,
+                # so we load it back to GPU (it's a no-op if it's already on GPU).
+                feats = prev["maskmem_features"].to(device, non_blocking=True)
+                to_cat_memory.append(feats.flatten(2).permute(2, 0, 1))
+                # Spatial positional encoding (it might have been offloaded to CPU in eval)
+                maskmem_enc = prev["maskmem_pos_enc"][-1].to(device)
+                maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1)
+                # Temporal positional encoding
+                # (index num_maskmem - 1 is used for t_pos=0, index 0 for t_pos=num_maskmem-1)
+                maskmem_enc = (
+                    maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1]
+                )
+                to_cat_memory_pos_embed.append(maskmem_enc)
+            # Construct the list of past object pointers
+            if self.use_obj_ptrs_in_encoder:
+                max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder)
+                # First add those object pointers from selected conditioning frames
+                # (optionally, only include object pointers in the past during evaluation)
+                if not self.training and self.only_obj_ptrs_in_the_past_for_eval:
+                    ptr_cond_outputs = {
+                        t: out
+                        for t, out in selected_cond_outputs.items()
+                        if (t >= frame_idx if track_in_reverse else t <= frame_idx)
+                    }
+                else:
+                    ptr_cond_outputs = selected_cond_outputs
+                pos_and_ptrs = [
+                    # Temporal pos encoding contains how far away each pointer is from current frame
+                    (
+                        (
+                            (frame_idx - t) * tpos_sign_mul
+                            if self.use_signed_tpos_enc_to_obj_ptrs
+                            else abs(frame_idx - t)
+                        ),
+                        out["obj_ptr"],
+                    )
+                    for t, out in ptr_cond_outputs.items()
+                ]
+                # Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before current frame
+                for t_diff in range(1, max_obj_ptrs_in_encoder):
+                    t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff
+                    # NOTE(review): `num_frames` was already passed to `min()` above, so a
+                    # None value would presumably have failed before reaching this guard — confirm.
+                    if t < 0 or (num_frames is not None and t >= num_frames):
+                        break
+                    out = output_dict["non_cond_frame_outputs"].get(
+                        t, unselected_cond_outputs.get(t, None)
+                    )
+                    if out is not None:
+                        pos_and_ptrs.append((t_diff, out["obj_ptr"]))
+                # If we have at least one object pointer, add them to the across attention
+                if len(pos_and_ptrs) > 0:
+                    pos_list, ptrs_list = zip(*pos_and_ptrs)
+                    # stack object pointers along dim=0 into [ptr_seq_len, B, C] shape
+                    obj_ptrs = torch.stack(ptrs_list, dim=0)
+                    # a temporal positional embedding based on how far each object pointer is from
+                    # the current frame (sine embedding normalized by the max pointer num).
+                    # default false.
+                    if self.add_tpos_enc_to_obj_ptrs:
+                        t_diff_max = max_obj_ptrs_in_encoder - 1
+                        tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim
+                        obj_pos = torch.tensor(pos_list, device=device)
+                        obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim)
+                        obj_pos = self.obj_ptr_tpos_proj(obj_pos)
+                        obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim)
+                    else:
+                        # no temporal encoding: use all-zero positional embeddings
+                        obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim)
+                    if self.mem_dim < C:
+                        # split a pointer into (C // self.mem_dim) tokens for self.mem_dim < C
+                        obj_ptrs = obj_ptrs.reshape(
+                            -1, B, C // self.mem_dim, self.mem_dim
+                        )
+                        obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1)
+                        # repeat each positional embedding once per split token
+                        obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0)
+                    to_cat_memory.append(obj_ptrs)
+                    to_cat_memory_pos_embed.append(obj_pos)
+                    num_obj_ptr_tokens = obj_ptrs.shape[0]
+                else:
+                    num_obj_ptr_tokens = 0
+        else:
+            # for initial conditioning frames, encode them without using any previous memory
+            if self.directly_add_no_mem_embed:
+                # directly add no-mem embedding (instead of using the transformer encoder)
+                pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed
+                pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
+                return pix_feat_with_mem
+            # Use a dummy token on the first frame (to avoid empty memory input to transformer encoder)
+            # The two assignments after the `raise` are unreachable; an initial conditioning
+            # frame without `directly_add_no_mem_embed` is not supported here.
+            raise NotImplementedError
+            to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
+            to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]
+        # Step 2: Concatenate the memories and forward through the transformer encoder
+        memory = torch.cat(to_cat_memory, dim=0)
+        memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)
+        pix_feat_with_mem = self.memory_attention(
+            curr=current_vision_feats,
+            curr_pos=current_vision_pos_embeds,
+            memory=memory,
+            memory_pos=memory_pos_embed,
+            num_obj_ptr_tokens=num_obj_ptr_tokens,
+        )
+        # reshape the output (HW)BC => BCHW
+        pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
+        return pix_feat_with_mem
+    def _encode_new_memory(
+        self,
+        current_vision_feats,
+        feat_sizes,
+        pred_masks_high_res,
+        object_score_logits,
+        is_mask_from_pts,
+    ):
+        """Encode the current image and its prediction into a memory feature."""
+        B = current_vision_feats[-1].size(1)  # batch size on this frame
+        C = self.hidden_dim
+        H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
+        # top-level feature, (HW)BC => BCHW
+        pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
+        if self.non_overlap_masks_for_mem_enc and not self.training:
+            # optionally, apply non-overlapping constraints to the masks (it's applied
+            # in the batch dimension and should only be used during eval, where all
+            # the objects come from the same video under batch size 1).
+            pred_masks_high_res = self._apply_non_overlapping_constraints(
+                pred_masks_high_res
+            )
+            raise NotImplementedError
+        # scale the raw mask logits with a temperature before applying sigmoid
+        binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
+        if binarize and not self.training:
+            mask_for_mem = (pred_masks_high_res > 0).float()
+        else:
+            # apply sigmoid on the raw mask logits to turn them into range (0, 1)
+            mask_for_mem = torch.sigmoid(pred_masks_high_res)
+        # apply scale and bias terms to the sigmoid probabilities
+        if self.sigmoid_scale_for_mem_enc != 1.0:
+            mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
+        if self.sigmoid_bias_for_mem_enc != 0.0:
+            mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
+        maskmem_out = self.memory_encoder(
+            pix_feat, mask_for_mem, skip_mask_sigmoid=True  # sigmoid already applied
+        )
+        maskmem_features = maskmem_out["vision_features"]
+        maskmem_pos_enc = maskmem_out["vision_pos_enc"]
+        # add a no-object embedding to the spatial memory to indicate that the frame
+        # is predicted to be occluded (i.e. no object is appearing in the frame)
+        if self.no_obj_embed_spatial is not None:
+            is_obj_appearing = (object_score_logits > 0).float()
+            maskmem_features += (
+                1 - is_obj_appearing[..., None, None]
+            ) * self.no_obj_embed_spatial[..., None, None].expand(
+                *maskmem_features.shape
+            )
+            # it will be used in sam2.1
+            # raise NotImplementedError
+        return maskmem_features, maskmem_pos_enc
+    def _track_step(
+        self,
+        frame_idx,
+        is_init_cond_frame,
+        current_vision_feats,
+        current_vision_pos_embeds,
+        feat_sizes,
+        point_inputs,
+        mask_inputs,
+        output_dict,
+        num_frames,
+        track_in_reverse,
+        prev_sam_mask_logits,
+    ):
+        current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
+        # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
+        if len(current_vision_feats) > 1:
+            high_res_features = [
+                x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
+                for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
+            ]
+        else:
+            high_res_features = None
+        if mask_inputs is not None and self.use_mask_input_as_output_without_sam:
+            # When use_mask_input_as_output_without_sam=True, we directly output the mask input
+            # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
+            pix_feat = current_vision_feats[-1].permute(1, 2, 0)
+            pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
+            sam_outputs = self._use_mask_as_output(
+                pix_feat, high_res_features, mask_inputs
+            )
+        else:
+            # fused the visual feature with previous memory features in the memory bank
+            pix_feat = self._prepare_memory_conditioned_features(
+                frame_idx=frame_idx,
+                is_init_cond_frame=is_init_cond_frame,
+                current_vision_feats=current_vision_feats[-1:],
+                current_vision_pos_embeds=current_vision_pos_embeds[-1:],
+                feat_sizes=feat_sizes[-1:],
+                output_dict=output_dict,
+                num_frames=num_frames,
+                track_in_reverse=track_in_reverse,
+            )
+            # apply SAM-style segmentation head
+            # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder,
+            # e.g. in demo where such logits come from earlier interaction instead of correction sampling
+            # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead)
+            if prev_sam_mask_logits is not None:
+                assert point_inputs is not None and mask_inputs is None
+                mask_inputs = prev_sam_mask_logits
+            multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
+            sam_outputs = self._forward_sam_heads(
+                backbone_features=pix_feat,
+                point_inputs=point_inputs,
+                mask_inputs=mask_inputs,
+                high_res_features=high_res_features,
+                multimask_output=multimask_output,
+            )
+        return current_out, sam_outputs, high_res_features, pix_feat
+    def _encode_memory_in_output(
+        self,
+        current_vision_feats,
+        feat_sizes,
+        point_inputs,
+        run_mem_encoder,
+        high_res_masks,
+        object_score_logits,
+        current_out,
+    ):
+        if run_mem_encoder and self.num_maskmem > 0:
+            high_res_masks_for_mem_enc = high_res_masks
+            maskmem_features, maskmem_pos_enc = self._encode_new_memory(
+                current_vision_feats=current_vision_feats,
+                feat_sizes=feat_sizes,
+                pred_masks_high_res=high_res_masks_for_mem_enc,
+                object_score_logits=object_score_logits,
+                is_mask_from_pts=(point_inputs is not None),
+            )
+            current_out["maskmem_features"] = maskmem_features
+            current_out["maskmem_pos_enc"] = maskmem_pos_enc
+        else:
+            current_out["maskmem_features"] = None
+            current_out["maskmem_pos_enc"] = None
+    def track_step(
+        self,
+        frame_idx,
+        is_init_cond_frame,
+        current_vision_feats,
+        current_vision_pos_embeds,
+        feat_sizes,
+        point_inputs,
+        mask_inputs,
+        output_dict,
+        num_frames,
+        track_in_reverse=False,  # tracking in reverse time order (for demo usage)
+        # Whether to run the memory encoder on the predicted masks. Sometimes we might want
+        # to skip the memory encoder with `run_mem_encoder=False`. For example,
+        # in demo we might call `track_step` multiple times for each user click,
+        # and only encode the memory when the user finalizes their clicks. And in ablation
+        # settings like SAM training on static images, we don't need the memory encoder.
+        run_mem_encoder=True,
+        # The previously predicted SAM mask logits (which can be fed together with new clicks in demo).
+        prev_sam_mask_logits=None,
+    ):
+        """
+        Run one tracking step on a frame and collect its outputs in a dict.
+        The SAM head outputs come from `self._track_step`; the low-res masks,
+        high-res masks and object pointer are stored under "pred_masks",
+        "pred_masks_high_res" and "obj_ptr". "object_score_logits" is stored only
+        when `self.training` is False. Finally `self._encode_memory_in_output`
+        fills "maskmem_features" and "maskmem_pos_enc" (both None when the memory
+        encoder is skipped).
+        Returns the resulting `current_out` dict.
+        """
+        # The last two return values (high-res features, pixel features) are unused here.
+        current_out, sam_outputs, _, _ = self._track_step(
+            frame_idx,
+            is_init_cond_frame,
+            current_vision_feats,
+            current_vision_pos_embeds,
+            feat_sizes,
+            point_inputs,
+            mask_inputs,
+            output_dict,
+            num_frames,
+            track_in_reverse,
+            prev_sam_mask_logits,
+        )
+        # `sam_outputs` is a 7-tuple; only its last four entries are needed here.
+        (
+            _,
+            _,
+            _,
+            low_res_masks,
+            high_res_masks,
+            obj_ptr,
+            object_score_logits,
+        ) = sam_outputs
+        current_out["pred_masks"] = low_res_masks
+        current_out["pred_masks_high_res"] = high_res_masks
+        current_out["obj_ptr"] = obj_ptr
+        if not self.training:
+            # Only add this in inference (to avoid unused param in activation checkpointing;
+            # it's mainly used in the demo to encode spatial memories w/ consolidated masks)
+            current_out["object_score_logits"] = object_score_logits
+        # Finally run the memory encoder on the predicted mask to encode
+        # it into a new memory feature (that can be used in future frames)
+        self._encode_memory_in_output(
+            current_vision_feats,
+            feat_sizes,
+            point_inputs,
+            run_mem_encoder,
+            high_res_masks,
+            object_score_logits,
+            current_out,
+        )
+        return current_out
+    def _use_multimask(self, is_init_cond_frame, point_inputs):
+        """Whether to use multimask output in the SAM head."""
+        num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
+        multimask_output = (
+            self.multimask_output_in_sam
+            and (is_init_cond_frame or self.multimask_output_for_tracking)
+            and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
+        )
+        return multimask_output
+    def _apply_non_overlapping_constraints(self, pred_masks):
+        """
+        Apply non-overlapping constraints to the object scores in pred_masks. Here we
+        keep only the highest scoring object at each spatial location in pred_masks.
+        """
+        batch_size = pred_masks.size(0)
+        if batch_size == 1:
+            return pred_masks
+        device = pred_masks.device
+        # "max_obj_inds": object index of the object with the highest score at each location
+        max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)
+        # "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks`
+        batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None]
+        keep = max_obj_inds == batch_obj_inds
+        # suppress overlapping regions' scores below -10.0 so that the foreground regions
+        # don't overlap (here sigmoid(-10.0)=4.5398e-05)
+        pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
+        return pred_masks

avs.code/v1m.code/model/visual/sam2/modeling/sam2_utils.py ADDED Viewed

	@@ -0,0 +1,323 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import copy
+from typing import Tuple
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from model.visual.sam2.utils.misc import mask_to_box
+def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num):
+    """
+    Select up to `max_cond_frame_num` conditioning frames from `cond_frame_outputs`
+    that are temporally closest to the current frame at `frame_idx`. Here, we take
+    - a) the closest conditioning frame before `frame_idx` (if any);
+    - b) the closest conditioning frame after `frame_idx` (if any);
+    - c) any other temporally closest conditioning frames until reaching a total
+         of `max_cond_frame_num` conditioning frames.
+    Outputs:
+    - selected_outputs: selected items (keys & values) from `cond_frame_outputs`.
+    - unselected_outputs: items (keys & values) not selected in `cond_frame_outputs`.
+    """
+    if max_cond_frame_num == -1 or len(cond_frame_outputs) <= max_cond_frame_num:
+        selected_outputs = cond_frame_outputs
+        unselected_outputs = {}
+    else:
+        assert max_cond_frame_num >= 2, "we should allow using 2+ conditioning frames"
+        selected_outputs = {}
+        # the closest conditioning frame before `frame_idx` (if any)
+        idx_before = max((t for t in cond_frame_outputs if t < frame_idx), default=None)
+        if idx_before is not None:
+            selected_outputs[idx_before] = cond_frame_outputs[idx_before]
+        # the closest conditioning frame after `frame_idx` (if any)
+        idx_after = min((t for t in cond_frame_outputs if t >= frame_idx), default=None)
+        if idx_after is not None:
+            selected_outputs[idx_after] = cond_frame_outputs[idx_after]
+        # add other temporally closest conditioning frames until reaching a total
+        # of `max_cond_frame_num` conditioning frames.
+        num_remain = max_cond_frame_num - len(selected_outputs)
+        inds_remain = sorted(
+            (t for t in cond_frame_outputs if t not in selected_outputs),
+            key=lambda x: abs(x - frame_idx),
+        )[:num_remain]
+        selected_outputs.update((t, cond_frame_outputs[t]) for t in inds_remain)
+        unselected_outputs = {
+            t: v for t, v in cond_frame_outputs.items() if t not in selected_outputs
+        }
+    return selected_outputs, unselected_outputs
+def get_1d_sine_pe(pos_inds, dim, temperature=10000):
+    """
+    Get 1D sine positional embedding as in the original Transformer paper.
+    """
+    pe_dim = dim // 2
+    dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device)
+    dim_t = temperature ** (2 * (dim_t // 2) / pe_dim)
+    pos_embed = pos_inds.unsqueeze(-1) / dim_t
+    pos_embed = torch.cat([pos_embed.sin(), pos_embed.cos()], dim=-1)
+    return pos_embed
+def get_activation_fn(activation):
+    """Return an activation function given a string"""
+    if activation == "relu":
+        return F.relu
+    if activation == "gelu":
+        return F.gelu
+    if activation == "glu":
+        return F.glu
+    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
+def get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+class DropPath(nn.Module):
+    # adapted from https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+    def __init__(self, drop_prob=0.0, scale_by_keep=True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+    def forward(self, x):
+        if self.drop_prob == 0.0 or not self.training:
+            return x
+        keep_prob = 1 - self.drop_prob
+        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
+        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+        if keep_prob > 0.0 and self.scale_by_keep:
+            random_tensor.div_(keep_prob)
+        return x * random_tensor
+# Lightly adapted from
+# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
+class MLP(nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        activation: nn.Module = nn.ReLU,
+        sigmoid_output: bool = False,
+    ) -> None:
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(
+            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
+        )
+        self.sigmoid_output = sigmoid_output
+        self.act = activation()
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x)
+        if self.sigmoid_output:
+            x = F.sigmoid(x)
+        return x
+# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
+# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119  # noqa
+class LayerNorm2d(nn.Module):
+    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_channels))
+        self.bias = nn.Parameter(torch.zeros(num_channels))
+        self.eps = eps
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        u = x.mean(1, keepdim=True)
+        s = (x - u).pow(2).mean(1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.eps)
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x
+def sample_box_points(
+    masks: torch.Tensor,
+    noise: float = 0.1,  # SAM default
+    noise_bound: int = 20,  # SAM default
+    top_left_label: int = 2,
+    bottom_right_label: int = 3,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Sample a noised version of the top left and bottom right corners of the
+    bounding box of each mask (boxes are obtained with `mask_to_box`).
+    Inputs:
+    - masks: [B, 1, H, W] masks, dtype=torch.Tensor
+    - noise: noise as a fraction of box width and height, dtype=float
+    - noise_bound: maximum amount of noise (in pure pixels), dtype=int
+    - top_left_label: label assigned to the top left corner point (default 2)
+    - bottom_right_label: label assigned to the bottom right corner point (default 3)
+    Returns:
+    - box_coords: [B, num_pt, 2] with num_pt == 2, contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.float
+    - box_labels: [B, num_pt], label 2 is reserved for top left and 3 for bottom right corners, dtype=torch.int32
+    """
+    device = masks.device
+    # NOTE(review): presumably [B, 1, 4] boxes as (x_min, y_min, x_max, y_max) -- confirm against `mask_to_box`
+    box_coords = mask_to_box(masks)
+    B, _, H, W = masks.shape
+    # flat [2 * B] label vector (top-left, bottom-right repeated per mask); reshaped to [B, 2] below
+    box_labels = torch.tensor(
+        [top_left_label, bottom_right_label], dtype=torch.int, device=device
+    ).repeat(B)
+    if noise > 0.0:
+        if not isinstance(noise_bound, torch.Tensor):
+            noise_bound = torch.tensor(noise_bound, device=device)
+        bbox_w = box_coords[..., 2] - box_coords[..., 0]
+        bbox_h = box_coords[..., 3] - box_coords[..., 1]
+        # per-box jitter amplitude: a fraction of the box size, capped at `noise_bound` pixels
+        max_dx = torch.min(bbox_w * noise, noise_bound)
+        max_dy = torch.min(bbox_h * noise, noise_bound)
+        # uniform noise in [-1, 1) for each of the four box coordinates
+        box_noise = 2 * torch.rand(B, 1, 4, device=device) - 1
+        box_noise = box_noise * torch.stack((max_dx, max_dy, max_dx, max_dy), dim=-1)
+        box_coords = box_coords + box_noise
+        img_bounds = (
+            torch.tensor([W, H, W, H], device=device) - 1
+        )  # uncentered pixel coords
+        box_coords.clamp_(torch.zeros_like(img_bounds), img_bounds)  # In place clamping
+    box_coords = box_coords.reshape(-1, 2, 2)  # always 2 points
+    box_labels = box_labels.reshape(-1, 2)
+    return box_coords, box_labels
+def sample_random_points_from_errors(gt_masks, pred_masks, num_pt=1):
+    """
+    Sample `num_pt` random points (along with their labels) independently from the error regions.
+    Inputs:
+    - gt_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool
+    - pred_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool or None
+    - num_pt: int, number of points to sample independently for each of the B error maps
+    Outputs:
+    - points: [B, num_pt, 2], dtype=torch.float, contains (x, y) coordinates of each sampled point
+    - labels: [B, num_pt], dtype=torch.int32, where 1 means positive clicks and 0 means
+      negative clicks
+    """
+    if pred_masks is None:  # if pred_masks is not provided, treat it as empty
+        pred_masks = torch.zeros_like(gt_masks)
+    assert gt_masks.dtype == torch.bool and gt_masks.size(1) == 1
+    assert pred_masks.dtype == torch.bool and pred_masks.shape == gt_masks.shape
+    assert num_pt >= 0
+    B, _, H_im, W_im = gt_masks.shape
+    device = gt_masks.device
+    # false positive region, a new point sampled in this region should have
+    # negative label to correct the FP error
+    fp_masks = ~gt_masks & pred_masks
+    # false negative region, a new point sampled in this region should have
+    # positive label to correct the FN error
+    fn_masks = gt_masks & ~pred_masks
+    # whether the prediction completely match the ground-truth on each mask
+    all_correct = torch.all((gt_masks == pred_masks).flatten(2), dim=2)
+    all_correct = all_correct[..., None, None]
+    # channel 0 is FP map, while channel 1 is FN map
+    pts_noise = torch.rand(B, num_pt, H_im, W_im, 2, device=device)
+    # sample a negative new click from FP region or a positive new click
+    # from FN region, depend on where the maximum falls,
+    # and in case the predictions are all correct (no FP or FN), we just
+    # sample a negative click from the background region
+    pts_noise[..., 0] *= fp_masks | (all_correct & ~gt_masks)
+    pts_noise[..., 1] *= fn_masks
+    pts_idx = pts_noise.flatten(2).argmax(dim=2)
+    labels = (pts_idx % 2).to(torch.int32)
+    pts_idx = pts_idx // 2
+    pts_x = pts_idx % W_im
+    pts_y = pts_idx // W_im
+    points = torch.stack([pts_x, pts_y], dim=2).to(torch.float)
+    return points, labels
+def sample_one_point_from_error_center(gt_masks, pred_masks, padding=True):
+    """
+    Sample 1 point (along with its label) from the center of the error region of each mask,
+    that is, the point with the largest distance to the boundary of the error region.
+    The false-negative and false-positive regions are compared and the one whose
+    best point lies further from its boundary wins (ties go to the false-positive region).
+    This is the RITM sampling method from https://github.com/saic-vul/ritm_interactive_segmentation/blob/master/isegm/inference/clicker.py
+    Inputs:
+    - gt_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool
+    - pred_masks: [B, 1, H_im, W_im] masks, dtype=torch.bool or None (None is
+      treated as an all-False prediction)
+    - padding: if True, pad with boundary of 1 px for distance transform
+    Outputs:
+    - points: [B, 1, 2], dtype=torch.float, contains (x, y) coordinates of each sampled point
+    - labels: [B, 1], dtype=torch.int32, where 1 means positive clicks and 0 means negative clicks
+    """
+    # imported lazily so that cv2 is only required when this sampler is used
+    import cv2
+    if pred_masks is None:
+        pred_masks = torch.zeros_like(gt_masks)
+    assert gt_masks.dtype == torch.bool and gt_masks.size(1) == 1
+    assert pred_masks.dtype == torch.bool and pred_masks.shape == gt_masks.shape
+    B, _, _, W_im = gt_masks.shape
+    device = gt_masks.device
+    # false positive region, a new point sampled in this region should have
+    # negative label to correct the FP error
+    fp_masks = ~gt_masks & pred_masks
+    # false negative region, a new point sampled in this region should have
+    # positive label to correct the FN error
+    fn_masks = gt_masks & ~pred_masks
+    # the distance transform below runs on NumPy arrays, so move the masks to the CPU
+    fp_masks = fp_masks.cpu().numpy()
+    fn_masks = fn_masks.cpu().numpy()
+    # outputs are filled on the CPU and moved to `device` at the end
+    points = torch.zeros(B, 1, 2, dtype=torch.float)
+    labels = torch.ones(B, 1, dtype=torch.int32)
+    for b in range(B):
+        fn_mask = fn_masks[b, 0]
+        fp_mask = fp_masks[b, 0]
+        if padding:
+            # zero border so that the image edge counts as a region boundary
+            fn_mask = np.pad(fn_mask, ((1, 1), (1, 1)), "constant")
+            fp_mask = np.pad(fp_mask, ((1, 1), (1, 1)), "constant")
+        # compute the distance of each point in FN/FP region to its boundary
+        fn_mask_dt = cv2.distanceTransform(fn_mask.astype(np.uint8), cv2.DIST_L2, 0)
+        fp_mask_dt = cv2.distanceTransform(fp_mask.astype(np.uint8), cv2.DIST_L2, 0)
+        if padding:
+            # drop the border again so that indices refer to the original image grid
+            fn_mask_dt = fn_mask_dt[1:-1, 1:-1]
+            fp_mask_dt = fp_mask_dt[1:-1, 1:-1]
+        # take the point in FN/FP region with the largest distance to its boundary
+        fn_mask_dt_flat = fn_mask_dt.reshape(-1)
+        fp_mask_dt_flat = fp_mask_dt.reshape(-1)
+        fn_argmax = np.argmax(fn_mask_dt_flat)
+        fp_argmax = np.argmax(fp_mask_dt_flat)
+        # strict comparison: on a tie (including two empty regions) the FP point is
+        # used, giving a negative click
+        is_positive = fn_mask_dt_flat[fn_argmax] > fp_mask_dt_flat[fp_argmax]
+        pt_idx = fn_argmax if is_positive else fp_argmax
+        # convert the flat row-major index back to (x, y)
+        points[b, 0, 0] = pt_idx % W_im  # x
+        points[b, 0, 1] = pt_idx // W_im  # y
+        labels[b, 0] = int(is_positive)
+    points = points.to(device)
+    labels = labels.to(device)
+    return points, labels
+def get_next_point(gt_masks, pred_masks, method):
+    if method == "uniform":
+        return sample_random_points_from_errors(gt_masks, pred_masks)
+    elif method == "center":
+        return sample_one_point_from_error_center(gt_masks, pred_masks)
+    else:
+        raise ValueError(f"unknown sampling method {method}")

avs.code/v1m.code/model/visual/sam2/organised_sam2_train.py ADDED Viewed

	@@ -0,0 +1,811 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import numpy as np
+import torch
+import torch.distributed
+from model.visual.sam2.modeling.sam2_base import SAM2Base
+from model.visual.sam2.modeling.sam2_utils import (
+    get_1d_sine_pe,
+    get_next_point,
+    sample_box_points,
+    select_closest_cond_frames,
+)
+from utils.misc import concat_points
+from utils.data_utils import BatchedVideoDatapoint
+class SAM2Train(SAM2Base):
+    def __init__(
+        self,
+        image_encoder,
+        memory_attention=None,
+        memory_encoder=None,
+        prob_to_use_pt_input_for_train=0.0,
+        prob_to_use_pt_input_for_eval=0.0,
+        prob_to_use_box_input_for_train=0.0,
+        prob_to_use_box_input_for_eval=0.0,
+        # if it is greater than 1, we do interactive point sampling in the 1st frame and other randomly selected frames
+        num_frames_to_correct_for_train=1,  # default: only iteratively sample on first frame
+        num_frames_to_correct_for_eval=1,  # default: only iteratively sample on first frame
+        rand_frames_to_correct_for_train=False,
+        rand_frames_to_correct_for_eval=False,
+        # how many frames to use as initial conditioning frames (for both point input and mask input; the first frame is always used as an initial conditioning frame)
+        # - if `rand_init_cond_frames` below is True, we randomly sample 1~num_init_cond_frames initial conditioning frames
+        # - otherwise we sample a fixed number of num_init_cond_frames initial conditioning frames
+        # note: for point input, we sample correction points on all such initial conditioning frames, and we require that `num_frames_to_correct` >= `num_init_cond_frames`;
+        # these are initial conditioning frames because as we track the video, more conditioning frames might be added
+        # when a frame receives correction clicks under point input if `add_all_frames_to_correct_as_cond=True`
+        num_init_cond_frames_for_train=1,  # default: only use the first frame as initial conditioning frame
+        num_init_cond_frames_for_eval=1,  # default: only use the first frame as initial conditioning frame
+        rand_init_cond_frames_for_train=True,  # default: random 1~num_init_cond_frames_for_train cond frames (to be consistent w/ previous TA data loader)
+        rand_init_cond_frames_for_eval=False,
+        # if `add_all_frames_to_correct_as_cond` is True, we also append to the conditioning frame list any frame that receives a later correction click
+        # if `add_all_frames_to_correct_as_cond` is False, we restrict the conditioning frame list to only use those initial conditioning frames
+        add_all_frames_to_correct_as_cond=False,
+        # how many additional correction points to sample (on each frame selected to be corrected)
+        # note that the first frame receives an initial input click (in addition to any correction clicks)
+        num_correction_pt_per_frame=7,
+        # method for point sampling during evaluation
+        # "uniform" (sample uniformly from error region) or "center" (use the point with the largest distance to error region boundary)
+        # default to "center" to be consistent with evaluation in the SAM paper
+        pt_sampling_for_eval="center",
+        # During training, we optionally allow sampling the correction points from GT regions
+        # instead of the prediction error regions with a small probability. This might allow the
+        # model to overfit less to the error regions in training datasets
+        prob_to_sample_from_gt_for_train=0.0,
+        use_act_ckpt_iterative_pt_sampling=False,
+        # whether to forward image features per frame (as it's being tracked) during evaluation, instead of forwarding image features
+        # of all frames at once. This avoids backbone OOM errors on very long videos in evaluation, but could be slightly slower.
+        forward_backbone_per_frame_for_eval=False,
+        freeze_image_encoder=False,
+        **kwargs,
+    ):
+        """
+        Set up the training wrapper around `SAM2Base`.
+        `image_encoder`, `memory_attention`, `memory_encoder` and any extra
+        `**kwargs` are forwarded to `SAM2Base.__init__`; the remaining arguments
+        (documented inline in the signature) are stored as attributes of the same
+        name. A NumPy random generator with a fixed seed is created as `self.rng`,
+        and when `freeze_image_encoder` is True all image encoder parameters get
+        `requires_grad = False`.
+        """
+        super().__init__(image_encoder, memory_attention, memory_encoder, **kwargs)
+        self.use_act_ckpt_iterative_pt_sampling = use_act_ckpt_iterative_pt_sampling
+        self.forward_backbone_per_frame_for_eval = forward_backbone_per_frame_for_eval
+        # Point sampler and conditioning frames
+        self.prob_to_use_pt_input_for_train = prob_to_use_pt_input_for_train
+        self.prob_to_use_box_input_for_train = prob_to_use_box_input_for_train
+        self.prob_to_use_pt_input_for_eval = prob_to_use_pt_input_for_eval
+        self.prob_to_use_box_input_for_eval = prob_to_use_box_input_for_eval
+        if prob_to_use_pt_input_for_train > 0 or prob_to_use_pt_input_for_eval > 0:
+            # NOTE(review): this also fires when only the eval probability is > 0,
+            # yet the message reports the training probability only.
+            logging.info(
+                f"Training with points (sampled from masks) as inputs with p={prob_to_use_pt_input_for_train}"
+            )
+            # with point input, every initial conditioning frame must also be a frame to correct
+            assert num_frames_to_correct_for_train >= num_init_cond_frames_for_train
+            assert num_frames_to_correct_for_eval >= num_init_cond_frames_for_eval
+        self.num_frames_to_correct_for_train = num_frames_to_correct_for_train
+        self.num_frames_to_correct_for_eval = num_frames_to_correct_for_eval
+        self.rand_frames_to_correct_for_train = rand_frames_to_correct_for_train
+        self.rand_frames_to_correct_for_eval = rand_frames_to_correct_for_eval
+        # Initial multi-conditioning frames
+        self.num_init_cond_frames_for_train = num_init_cond_frames_for_train
+        self.num_init_cond_frames_for_eval = num_init_cond_frames_for_eval
+        self.rand_init_cond_frames_for_train = rand_init_cond_frames_for_train
+        self.rand_init_cond_frames_for_eval = rand_init_cond_frames_for_eval
+        self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond
+        self.num_correction_pt_per_frame = num_correction_pt_per_frame
+        self.pt_sampling_for_eval = pt_sampling_for_eval
+        self.prob_to_sample_from_gt_for_train = prob_to_sample_from_gt_for_train
+        # A random number generator with a fixed initial seed across GPUs
+        self.rng = np.random.default_rng(seed=42)
+        if freeze_image_encoder:
+            for p in self.image_encoder.parameters():
+                p.requires_grad = False
+    def forward(self, input: BatchedVideoDatapoint):
+        if self.training or not self.forward_backbone_per_frame_for_eval:
+            # precompute image features on all frames before tracking
+            backbone_out = self.forward_image(input.flat_img_batch)
+        else:
+            # defer image feature computation on a frame until it's being tracked
+            backbone_out = {"backbone_fpn": None, "vision_pos_enc": None}
+        backbone_out = self.prepare_prompt_inputs(backbone_out, input)
+        previous_stages_out = self.forward_tracking(backbone_out, input)
+        return previous_stages_out
+    def _prepare_backbone_features_per_frame(self, img_batch, img_ids):
+        """Compute the image backbone features on the fly for the given img_ids."""
+        # Only forward backbone on unique image ids to avoid repetitive computation
+        # (if `img_ids` has only one element, it's already unique so we skip this step).
+        if img_ids.numel() > 1:
+            unique_img_ids, inv_ids = torch.unique(img_ids, return_inverse=True)
+        else:
+            unique_img_ids, inv_ids = img_ids, None
+        # Compute the image features on those unique image ids
+        image = img_batch[unique_img_ids]
+        backbone_out = self.forward_image(image)
+        (
+            _,
+            vision_feats,
+            vision_pos_embeds,
+            feat_sizes,
+        ) = self._prepare_backbone_features(backbone_out)
+        '''
+        vision_feats
+        torch.Size([65536, 5, 32])
+        torch.Size([16384, 5, 64])
+        torch.Size([4096, 5, 256])
+        '''
+        # Inverse-map image features for `unique_img_ids` to the final image features
+        # for the original input `img_ids`.
+        if inv_ids is not None:
+            image = image[inv_ids]
+            vision_feats = [x[:, inv_ids] for x in vision_feats]
+            vision_pos_embeds = [x[:, inv_ids] for x in vision_pos_embeds]
+        return image, vision_feats, vision_pos_embeds, feat_sizes
+    @staticmethod
+    def dont_prepare_prompt_inputs(backbone_out, num_frames=5, cond_frame=0):
+        backbone_out["gt_masks_per_frame"] = {}
+        backbone_out["num_frames"] = num_frames
+        backbone_out["use_pt_input"] = False
+        # always start from the first frame.
+        backbone_out["init_cond_frames"] = [cond_frame]
+        backbone_out["frames_not_in_init_cond"] = [i for i in range(0, num_frames) if i != cond_frame]
+        # backbone_out["init_cond_frames"] = []
+        # backbone_out["frames_not_in_init_cond"] = [i for i in range(0, num_frames)]
+        backbone_out["mask_inputs_per_frame"] = {}
+        backbone_out["point_inputs_per_frame"] = {}
+        backbone_out["frames_to_add_correction_pt"] = []
+        return backbone_out
+    def prepare_prompt_inputs(self, backbone_out, input, start_frame_idx=0):
+        """
+        Prepare input mask, point or box prompts. Optionally, we allow tracking from
+        a custom `start_frame_idx` to the end of the video (for evaluation purposes).
+        """
+        # Load the ground-truth masks on all frames (so that we can later
+        # sample correction points from them)
+        # gt_masks_per_frame = {
+        #     stage_id: targets.segments.unsqueeze(1)  # [B, 1, H_im, W_im]
+        #     for stage_id, targets in enumerate(input.find_targets)
+        # }
+        gt_masks_per_frame = {
+            stage_id: masks.unsqueeze(1)  # [B, 1, H_im, W_im]
+            for stage_id, masks in enumerate(input.masks)
+        }
+        # gt_masks_per_frame = input.masks.unsqueeze(2) # [T,B,1,H_im,W_im] keep everything in tensor form
+        backbone_out["gt_masks_per_frame"] = gt_masks_per_frame
+        num_frames = input.num_frames
+        backbone_out["num_frames"] = num_frames
+        # Randomly decide whether to use point inputs or mask inputs
+        if self.training:
+            prob_to_use_pt_input = self.prob_to_use_pt_input_for_train
+            prob_to_use_box_input = self.prob_to_use_box_input_for_train
+            num_frames_to_correct = self.num_frames_to_correct_for_train
+            rand_frames_to_correct = self.rand_frames_to_correct_for_train
+            num_init_cond_frames = self.num_init_cond_frames_for_train
+            rand_init_cond_frames = self.rand_init_cond_frames_for_train
+        else:
+            prob_to_use_pt_input = self.prob_to_use_pt_input_for_eval
+            prob_to_use_box_input = self.prob_to_use_box_input_for_eval
+            num_frames_to_correct = self.num_frames_to_correct_for_eval
+            rand_frames_to_correct = self.rand_frames_to_correct_for_eval
+            num_init_cond_frames = self.num_init_cond_frames_for_eval
+            rand_init_cond_frames = self.rand_init_cond_frames_for_eval
+        if num_frames == 1:
+            # here we handle a special case for mixing video + SAM on image training,
+            # where we force using point input for the SAM task on static images
+            prob_to_use_pt_input = 1.0
+            num_frames_to_correct = 1
+            num_init_cond_frames = 1
+        assert num_init_cond_frames >= 1
+        # (here `self.rng.random()` returns value in range 0.0 <= X < 1.0)
+        use_pt_input = self.rng.random() < prob_to_use_pt_input
+        if rand_init_cond_frames and num_init_cond_frames > 1:
+            # randomly select 1 to `num_init_cond_frames` frames as initial conditioning frames
+            num_init_cond_frames = self.rng.integers(
+                1, num_init_cond_frames, endpoint=True
+            )
+        if (
+            use_pt_input
+            and rand_frames_to_correct
+            and num_frames_to_correct > num_init_cond_frames
+        ):
+            # randomly select `num_init_cond_frames` to `num_frames_to_correct` frames to sample
+            # correction clicks (only for the case of point input)
+            num_frames_to_correct = self.rng.integers(
+                num_init_cond_frames, num_frames_to_correct, endpoint=True
+            )
+        backbone_out["use_pt_input"] = use_pt_input
+        # Sample initial conditioning frames
+        if num_init_cond_frames == 1:
+            init_cond_frames = [start_frame_idx]  # starting frame
+        else:
+            # starting frame + randomly selected remaining frames (without replacement)
+            init_cond_frames = [start_frame_idx] + self.rng.choice(
+                range(start_frame_idx + 1, num_frames),
+                num_init_cond_frames - 1,
+                replace=False,
+            ).tolist()
+        backbone_out["init_cond_frames"] = init_cond_frames
+        backbone_out["frames_not_in_init_cond"] = [
+            t for t in range(start_frame_idx, num_frames) if t not in init_cond_frames
+        ]
+        # Prepare mask or point inputs on initial conditioning frames
+        backbone_out["mask_inputs_per_frame"] = {}  # {frame_idx: <input_masks>}
+        backbone_out["point_inputs_per_frame"] = {}  # {frame_idx: <input_points>}
+        for t in init_cond_frames:
+            if not use_pt_input:
+                backbone_out["mask_inputs_per_frame"][t] = gt_masks_per_frame[t]
+            else:
+                # During training # P(box) = prob_to_use_pt_input * prob_to_use_box_input
+                use_box_input = self.rng.random() < prob_to_use_box_input
+                if use_box_input:
+                    points, labels = sample_box_points(
+                        gt_masks_per_frame[t],
+                    )
+                else:
+                    # (here we only sample **one initial point** on initial conditioning frames from the
+                    # ground-truth mask; we may sample more correction points on the fly)
+                    points, labels = get_next_point(
+                        gt_masks=gt_masks_per_frame[t],
+                        pred_masks=None,
+                        method=(
+                            "uniform" if self.training else self.pt_sampling_for_eval
+                        ),
+                    )
+                point_inputs = {"point_coords": points, "point_labels": labels}
+                backbone_out["point_inputs_per_frame"][t] = point_inputs
+        # Sample frames where we will add correction clicks on the fly
+        # based on the error between prediction and ground-truth masks
+        if not use_pt_input:
+            # no correction points will be sampled when using mask inputs
+            frames_to_add_correction_pt = []
+        elif num_frames_to_correct == num_init_cond_frames:
+            frames_to_add_correction_pt = init_cond_frames
+        else:
+            assert num_frames_to_correct > num_init_cond_frames
+            # initial cond frame + randomly selected remaining frames (without replacement)
+            extra_num = num_frames_to_correct - num_init_cond_frames
+            frames_to_add_correction_pt = (
+                init_cond_frames
+                + self.rng.choice(
+                    backbone_out["frames_not_in_init_cond"], extra_num, replace=False
+                ).tolist()
+            )
+        backbone_out["frames_to_add_correction_pt"] = frames_to_add_correction_pt
+        return backbone_out
+    def forward_tracking_wo_prompt(self, backbone_out, audio_res=None, return_dict=False):
+        # img_feats_already_computed = True.
+        """Forward video tracking on each frame (and sample correction clicks)."""
+        # Prepare the backbone features
+        # - vision_feats and vision_pos_embeds are in (HW)BC format
+        (
+            _,
+            vision_feats,
+            vision_pos_embeds,
+            feat_sizes,
+        ) = self._prepare_backbone_features(backbone_out)
+        # Starting the stage loop
+        num_frames = backbone_out["num_frames"]
+        init_cond_frames = backbone_out["init_cond_frames"]
+        frames_to_add_correction_pt = backbone_out["frames_to_add_correction_pt"]
+        # first process all the initial conditioning frames to encode them as memory,
+        # and then conditioning on them to track the remaining frames
+        processing_order = init_cond_frames + backbone_out["frames_not_in_init_cond"]
+        output_dict = {
+            "cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+            "non_cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+        }
+        av_v_feats, av_a_feats = audio_res
+        for stage_id in processing_order:
+            # Get the image features for the current frames
+            img_ids = stage_id
+            # Retrieve image features according to img_ids (if they are already computed).
+            current_vision_feats = [x[:, img_ids].unsqueeze(1) for x in vision_feats] # add unsqueeze to maintain single sample.
+            current_vision_pos_embeds = [x[:, img_ids].unsqueeze(1) for x in vision_pos_embeds] # add unsqueeze to maintain single sample.
+            current_av_v_feats = [x[img_ids] for x in av_v_feats]
+            current_av_a_feats = [x[img_ids] for x in av_a_feats]
+            # Get output masks based on this frame's prompts and previous memory
+            current_out = self.track_step_wo_prompt(
+                frame_idx=stage_id,
+                is_init_cond_frame=stage_id in init_cond_frames,
+                current_vision_feats=current_vision_feats,
+                current_vision_pos_embeds=current_vision_pos_embeds,
+                feat_sizes=feat_sizes,
+                point_inputs=None, # backbone_out["point_inputs_per_frame"].get(stage_id, None),
+                mask_inputs=None, # backbone_out["mask_inputs_per_frame"].get(stage_id, None),
+                gt_masks=None, # backbone_out["gt_masks_per_frame"].get(stage_id, None),
+                frames_to_add_correction_pt=None, # frames_to_add_correction_pt,
+                output_dict=output_dict,
+                num_frames=num_frames,
+                audio_res=(current_av_v_feats, current_av_a_feats),
+            )
+            # Append the output, depending on whether it's a conditioning frame
+            add_output_as_cond_frame = stage_id in init_cond_frames or (
+                self.add_all_frames_to_correct_as_cond
+                and stage_id in frames_to_add_correction_pt
+            )
+            if add_output_as_cond_frame:
+                output_dict["cond_frame_outputs"][stage_id] = current_out
+            else:
+                output_dict["non_cond_frame_outputs"][stage_id] = current_out
+        if return_dict:
+            return output_dict
+        # turn `output_dict` into a list for loss function
+        all_frame_outputs = {}
+        all_frame_outputs.update(output_dict["cond_frame_outputs"])
+        all_frame_outputs.update(output_dict["non_cond_frame_outputs"])
+        all_frame_outputs = [all_frame_outputs[t] for t in range(num_frames)]
+        # Make DDP happy with activation checkpointing by removing unused keys
+        all_frame_outputs = [
+            {k: v for k, v in d.items() if k != "obj_ptr"} for d in all_frame_outputs
+        ]
+        return all_frame_outputs
+    def track_step_wo_prompt(
+        self,
+        frame_idx,
+        is_init_cond_frame,
+        current_vision_feats,
+        current_vision_pos_embeds,
+        feat_sizes,
+        point_inputs,
+        mask_inputs,
+        output_dict,
+        num_frames,
+        track_in_reverse=False,  # tracking in reverse time order (for demo usage)
+        run_mem_encoder=True,  # Whether to run the memory encoder on the predicted masks.
+        prev_sam_mask_logits=None,  # The previously predicted SAM mask logits.
+        frames_to_add_correction_pt=None,
+        gt_masks=None,
+        audio_res=None,
+    ):
+        if frames_to_add_correction_pt is None:
+            frames_to_add_correction_pt = []
+        current_out, sam_outputs, high_res_features, pix_feat = self._track_step_wo_prompt(
+            frame_idx,
+            is_init_cond_frame,
+            current_vision_feats,
+            current_vision_pos_embeds,
+            feat_sizes,
+            point_inputs,
+            mask_inputs,
+            output_dict,
+            num_frames,
+            track_in_reverse,
+            prev_sam_mask_logits,
+            audio_res
+        )
+        (
+            low_res_multimasks,
+            high_res_multimasks,
+            ious,
+            low_res_masks,
+            high_res_masks,
+            obj_ptr,
+            object_score_logits,
+        ) = sam_outputs
+        current_out["multistep_pred_masks"] = low_res_masks
+        current_out["multistep_pred_masks_high_res"] = high_res_masks
+        current_out["multistep_pred_multimasks"] = [low_res_multimasks]
+        current_out["multistep_pred_multimasks_high_res"] = [high_res_multimasks]
+        current_out["multistep_pred_ious"] = [ious]
+        current_out["multistep_point_inputs"] = [point_inputs]
+        current_out["multistep_object_score_logits"] = [object_score_logits]
+        '''
+        # Optionally, sample correction points iteratively to correct the mask
+        if frame_idx in frames_to_add_correction_pt:
+            point_inputs, final_sam_outputs = self._iter_correct_pt_sampling(
+                is_init_cond_frame,
+                point_inputs,
+                gt_masks,
+                high_res_features,
+                pix_feat,
+                low_res_multimasks,
+                high_res_multimasks,
+                ious,
+                low_res_masks,
+                high_res_masks,
+                object_score_logits,
+                current_out,
+            )
+            (
+                _,
+                _,
+                _,
+                low_res_masks,
+                high_res_masks,
+                obj_ptr,
+                object_score_logits,
+            ) = final_sam_outputs
+        '''
+        # Use the final prediction (after all correction steps for output and eval)
+        current_out["pred_masks"] = low_res_masks
+        current_out["pred_masks_high_res"] = high_res_masks
+        current_out["obj_ptr"] = obj_ptr
+        # Finally run the memory encoder on the predicted mask to encode
+        # it into a new memory feature (that can be used in future frames)
+        self._encode_memory_in_output(
+            current_vision_feats,
+            feat_sizes,
+            666., # point_inputs,
+            run_mem_encoder,
+            # we follow SAM2 predictor, if we have multiple masks output, we only utilise the first one to perform
+            # the memory rope attention.
+            high_res_masks,
+            object_score_logits,
+            current_out,
+        )
+        return current_out
+    def _track_step_wo_prompt(
+        self,
+        frame_idx,
+        is_init_cond_frame,
+        current_vision_feats,
+        current_vision_pos_embeds,
+        feat_sizes,
+        point_inputs,
+        mask_inputs,
+        output_dict,
+        num_frames,
+        track_in_reverse,
+        prev_sam_mask_logits,
+        audio_res=None
+    ):
+        current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
+        # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
+        if len(current_vision_feats) > 1:
+            high_res_features = [
+                x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
+                for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1])
+            ]
+        else:
+            high_res_features = None
+        if mask_inputs is not None and self.use_mask_input_as_output_without_sam: # False
+            # When use_mask_input_as_output_without_sam=True, we directly output the mask input
+            # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
+            pix_feat = current_vision_feats[-1].permute(1, 2, 0)
+            pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
+            sam_outputs = self._use_mask_as_output(
+                pix_feat, high_res_features, mask_inputs
+            )
+        else:
+            # fused the visual feature with previous memory features in the memory bank
+            pix_feat = self._prepare_memory_conditioned_features(
+                frame_idx=frame_idx,
+                is_init_cond_frame=is_init_cond_frame,
+                current_vision_feats=current_vision_feats[-1:],
+                current_vision_pos_embeds=current_vision_pos_embeds[-1:],
+                feat_sizes=feat_sizes[-1:],
+                output_dict=output_dict,
+                num_frames=num_frames,
+                track_in_reverse=track_in_reverse,
+            )
+            # current_vision_feats[-1] = current_vision_feats[-1] + self.no_mem_embed
+            # pix_feat = current_vision_feats[-1].permute(1, 2, 0)
+            # pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1])
+            # we do not apply any prompts except audio.
+            '''
+            # apply SAM-style segmentation head
+            # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder,
+            # e.g. in demo where such logits come from earlier interaction instead of correction sampling
+            # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead)
+            # if prev_sam_mask_logits is not None:
+            #     assert point_inputs is not None and mask_inputs is None
+            #     mask_inputs = prev_sam_mask_logits
+            ## comment this line, as we don't use points as prompts.
+            # multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
+            '''
+            sam_outputs = self._forward_sam_heads(
+                backbone_features=pix_feat,
+                point_inputs=point_inputs,
+                mask_inputs=mask_inputs,
+                high_res_features=high_res_features,
+                multimask_output=True,
+                audio_res=audio_res
+            )
+        return current_out, sam_outputs, high_res_features, pix_feat
+    def forward_tracking(
+        self, backbone_out, input: BatchedVideoDatapoint, return_dict=False
+    ):
+        """Forward video tracking on each frame (and sample correction clicks)."""
+        img_feats_already_computed = backbone_out["backbone_fpn"] is not None
+        if img_feats_already_computed:
+            # Prepare the backbone features
+            # - vision_feats and vision_pos_embeds are in (HW)BC format
+            (
+                _,
+                vision_feats,
+                vision_pos_embeds,
+                feat_sizes,
+            ) = self._prepare_backbone_features(backbone_out)
+        # Starting the stage loop
+        num_frames = backbone_out["num_frames"]
+        init_cond_frames = backbone_out["init_cond_frames"]
+        frames_to_add_correction_pt = backbone_out["frames_to_add_correction_pt"]
+        # first process all the initial conditioning frames to encode them as memory,
+        # and then conditioning on them to track the remaining frames
+        processing_order = init_cond_frames + backbone_out["frames_not_in_init_cond"]
+        output_dict = {
+            "cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+            "non_cond_frame_outputs": {},  # dict containing {frame_idx: <out>}
+        }
+        for stage_id in processing_order:
+            # Get the image features for the current frames
+            # img_ids = input.find_inputs[stage_id].img_ids
+            img_ids = input.flat_obj_to_img_idx[stage_id]
+            if img_feats_already_computed:
+                # Retrieve image features according to img_ids (if they are already computed).
+                current_vision_feats = [x[:, img_ids] for x in vision_feats]
+                current_vision_pos_embeds = [x[:, img_ids] for x in vision_pos_embeds]
+            else:
+                # Otherwise, compute the image features on the fly for the given img_ids
+                # (this might be used for evaluation on long videos to avoid backbone OOM).
+                (
+                    _,
+                    current_vision_feats,
+                    current_vision_pos_embeds,
+                    feat_sizes,
+                ) = self._prepare_backbone_features_per_frame(
+                    input.flat_img_batch, img_ids
+                )
+            # Get output masks based on this frame's prompts and previous memory
+            current_out = self.track_step(
+                frame_idx=stage_id,
+                is_init_cond_frame=stage_id in init_cond_frames,
+                current_vision_feats=current_vision_feats,
+                current_vision_pos_embeds=current_vision_pos_embeds,
+                feat_sizes=feat_sizes,
+                point_inputs=backbone_out["point_inputs_per_frame"].get(stage_id, None),
+                mask_inputs=backbone_out["mask_inputs_per_frame"].get(stage_id, None),
+                gt_masks=backbone_out["gt_masks_per_frame"].get(stage_id, None),
+                frames_to_add_correction_pt=frames_to_add_correction_pt,
+                output_dict=output_dict,
+                num_frames=num_frames,
+            )
+            # Append the output, depending on whether it's a conditioning frame
+            add_output_as_cond_frame = stage_id in init_cond_frames or (
+                self.add_all_frames_to_correct_as_cond
+                and stage_id in frames_to_add_correction_pt
+            )
+            if add_output_as_cond_frame:
+                output_dict["cond_frame_outputs"][stage_id] = current_out
+            else:
+                output_dict["non_cond_frame_outputs"][stage_id] = current_out
+        if return_dict:
+            return output_dict
+        # turn `output_dict` into a list for loss function
+        all_frame_outputs = {}
+        all_frame_outputs.update(output_dict["cond_frame_outputs"])
+        all_frame_outputs.update(output_dict["non_cond_frame_outputs"])
+        all_frame_outputs = [all_frame_outputs[t] for t in range(num_frames)]
+        # Make DDP happy with activation checkpointing by removing unused keys
+        all_frame_outputs = [
+            {k: v for k, v in d.items() if k != "obj_ptr"} for d in all_frame_outputs
+        ]
+        return all_frame_outputs
+    def track_step(
+        self,
+        frame_idx,
+        is_init_cond_frame,
+        current_vision_feats,
+        current_vision_pos_embeds,
+        feat_sizes,
+        point_inputs,
+        mask_inputs,
+        output_dict,
+        num_frames,
+        track_in_reverse=False,  # tracking in reverse time order (for demo usage)
+        run_mem_encoder=True,  # Whether to run the memory encoder on the predicted masks.
+        prev_sam_mask_logits=None,  # The previously predicted SAM mask logits.
+        frames_to_add_correction_pt=None,
+        gt_masks=None,
+    ):
+        if frames_to_add_correction_pt is None:
+            frames_to_add_correction_pt = []
+        current_out, sam_outputs, high_res_features, pix_feat = self._track_step(
+            frame_idx,
+            is_init_cond_frame,
+            current_vision_feats,
+            current_vision_pos_embeds,
+            feat_sizes,
+            point_inputs,
+            mask_inputs,
+            output_dict,
+            num_frames,
+            track_in_reverse,
+            prev_sam_mask_logits,
+        )
+        (
+            low_res_multimasks,
+            high_res_multimasks,
+            ious,
+            low_res_masks,
+            high_res_masks,
+            obj_ptr,
+            object_score_logits,
+        ) = sam_outputs
+        current_out["multistep_pred_masks"] = low_res_masks
+        current_out["multistep_pred_masks_high_res"] = high_res_masks
+        current_out["multistep_pred_multimasks"] = [low_res_multimasks]
+        current_out["multistep_pred_multimasks_high_res"] = [high_res_multimasks]
+        current_out["multistep_pred_ious"] = [ious]
+        current_out["multistep_point_inputs"] = [point_inputs]
+        current_out["multistep_object_score_logits"] = [object_score_logits]
+        # Optionally, sample correction points iteratively to correct the mask
+        if frame_idx in frames_to_add_correction_pt:
+            point_inputs, final_sam_outputs = self._iter_correct_pt_sampling(
+                is_init_cond_frame,
+                point_inputs,
+                gt_masks,
+                high_res_features,
+                pix_feat,
+                low_res_multimasks,
+                high_res_multimasks,
+                ious,
+                low_res_masks,
+                high_res_masks,
+                object_score_logits,
+                current_out,
+            )
+            (
+                _,
+                _,
+                _,
+                low_res_masks,
+                high_res_masks,
+                obj_ptr,
+                object_score_logits,
+            ) = final_sam_outputs
+        # Use the final prediction (after all correction steps for output and eval)
+        current_out["pred_masks"] = low_res_masks
+        current_out["pred_masks_high_res"] = high_res_masks
+        current_out["obj_ptr"] = obj_ptr
+        # Finally run the memory encoder on the predicted mask to encode
+        # it into a new memory feature (that can be used in future frames)
+        self._encode_memory_in_output(
+            current_vision_feats,
+            feat_sizes,
+            point_inputs,
+            run_mem_encoder,
+            high_res_masks,
+            object_score_logits,
+            current_out,
+        )
+        return current_out
+    def _iter_correct_pt_sampling(
+        self,
+        is_init_cond_frame,
+        point_inputs,
+        gt_masks,
+        high_res_features,
+        pix_feat_with_mem,
+        low_res_multimasks,
+        high_res_multimasks,
+        ious,
+        low_res_masks,
+        high_res_masks,
+        object_score_logits,
+        current_out,
+    ):
+        assert gt_masks is not None
+        all_pred_masks = [low_res_masks]
+        all_pred_high_res_masks = [high_res_masks]
+        all_pred_multimasks = [low_res_multimasks]
+        all_pred_high_res_multimasks = [high_res_multimasks]
+        all_pred_ious = [ious]
+        all_point_inputs = [point_inputs]
+        all_object_score_logits = [object_score_logits]
+        for _ in range(self.num_correction_pt_per_frame):
+            # sample a new point from the error between prediction and ground-truth
+            # (with a small probability, directly sample from GT masks instead of errors)
+            if self.training and self.prob_to_sample_from_gt_for_train > 0:
+                sample_from_gt = (
+                    self.rng.random() < self.prob_to_sample_from_gt_for_train
+                )
+            else:
+                sample_from_gt = False
+            # if `pred_for_new_pt` is None, only GT masks will be used for point sampling
+            pred_for_new_pt = None if sample_from_gt else (high_res_masks > 0)
+            new_points, new_labels = get_next_point(
+                gt_masks=gt_masks,
+                pred_masks=pred_for_new_pt,
+                method="uniform" if self.training else self.pt_sampling_for_eval,
+            )
+            point_inputs = concat_points(point_inputs, new_points, new_labels)
+            # Feed the mask logits of the previous SAM outputs in the next SAM decoder step.
+            # For tracking, this means that when the user adds a correction click, we also feed
+            # the tracking output mask logits along with the click as input to the SAM decoder.
+            mask_inputs = low_res_masks
+            multimask_output = self._use_multimask(is_init_cond_frame, point_inputs)
+            if self.use_act_ckpt_iterative_pt_sampling and not multimask_output:
+                sam_outputs = torch.utils.checkpoint.checkpoint(
+                    self._forward_sam_heads,
+                    backbone_features=pix_feat_with_mem,
+                    point_inputs=point_inputs,
+                    mask_inputs=mask_inputs,
+                    high_res_features=high_res_features,
+                    multimask_output=multimask_output,
+                    use_reentrant=False,
+                )
+            else:
+                sam_outputs = self._forward_sam_heads(
+                    backbone_features=pix_feat_with_mem,
+                    point_inputs=point_inputs,
+                    mask_inputs=mask_inputs,
+                    high_res_features=high_res_features,
+                    multimask_output=multimask_output,
+                )
+            (
+                low_res_multimasks,
+                high_res_multimasks,
+                ious,
+                low_res_masks,
+                high_res_masks,
+                _,
+                object_score_logits,
+            ) = sam_outputs
+            all_pred_masks.append(low_res_masks)
+            all_pred_high_res_masks.append(high_res_masks)
+            all_pred_multimasks.append(low_res_multimasks)
+            all_pred_high_res_multimasks.append(high_res_multimasks)
+            all_pred_ious.append(ious)
+            all_point_inputs.append(point_inputs)
+            all_object_score_logits.append(object_score_logits)
+        # Concatenate the masks along channel (to compute losses on all of them,
+        # using `MultiStepIteractiveMasks`)
+        current_out["multistep_pred_masks"] = torch.cat(all_pred_masks, dim=1)
+        current_out["multistep_pred_masks_high_res"] = torch.cat(
+            all_pred_high_res_masks, dim=1
+        )
+        current_out["multistep_pred_multimasks"] = all_pred_multimasks
+        current_out["multistep_pred_multimasks_high_res"] = all_pred_high_res_multimasks
+        current_out["multistep_pred_ious"] = all_pred_ious
+        current_out["multistep_point_inputs"] = all_point_inputs
+        current_out["multistep_object_score_logits"] = all_object_score_logits
+        return point_inputs, sam_outputs

avs.code/v1m.code/model/visual/sam2/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.