| scratch: |
| resolution: 1024 |
| train_batch_size: 1 |
| num_train_workers: 3 |
| num_frames: 8 |
| max_num_objects: 4 |
| base_lr: 5.0e-06 |
| vision_lr: 3.0e-06 |
| phases_per_epoch: 1 |
| num_epochs: 40 |
| dataset: |
| img_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/JPEGImages |
| gt_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/Annotations |
| file_list_txt: null |
| multiplier: 2 |
| vos: |
| train_transforms: |
| - _target_: training.dataset.transforms.ComposeAPI |
| transforms: |
| - _target_: training.dataset.transforms.RandomHorizontalFlip |
| consistent_transform: true |
| - _target_: training.dataset.transforms.RandomAffine |
| degrees: 25 |
| shear: 20 |
| image_interpolation: bilinear |
| consistent_transform: true |
| - _target_: training.dataset.transforms.RandomResizeAPI |
| sizes: 1024 |
| square: true |
| consistent_transform: true |
| - _target_: training.dataset.transforms.ColorJitter |
| consistent_transform: true |
| brightness: 0.1 |
| contrast: 0.03 |
| saturation: 0.03 |
| hue: null |
| - _target_: training.dataset.transforms.RandomGrayscale |
| p: 0.05 |
| consistent_transform: true |
| - _target_: training.dataset.transforms.ColorJitter |
| consistent_transform: false |
| brightness: 0.1 |
| contrast: 0.05 |
| saturation: 0.05 |
| hue: null |
| - _target_: training.dataset.transforms.ToTensorAPI |
| - _target_: training.dataset.transforms.NormalizeAPI |
| mean: |
| - 0.485 |
| - 0.456 |
| - 0.406 |
| std: |
| - 0.229 |
| - 0.224 |
| - 0.225 |
| trainer: |
| _target_: training.trainer.Trainer |
| mode: train_only |
| max_epochs: 40 |
| accelerator: cuda |
| seed_value: 123 |
| model: |
| _target_: training.model.sam2.SAM2Train |
| image_encoder: |
| _target_: sam2.modeling.backbones.image_encoder.ImageEncoder |
| scalp: 1 |
| trunk: |
| _target_: sam2.modeling.backbones.hieradet.Hiera |
| embed_dim: 144 |
| num_heads: 2 |
| stages: |
| - 2 |
| - 6 |
| - 36 |
| - 4 |
| global_att_blocks: |
| - 23 |
| - 33 |
| - 43 |
| window_pos_embed_bkg_spatial_size: |
| - 7 |
| - 7 |
| window_spec: |
| - 8 |
| - 4 |
| - 16 |
| - 8 |
| neck: |
| _target_: sam2.modeling.backbones.image_encoder.FpnNeck |
| position_encoding: |
| _target_: sam2.modeling.position_encoding.PositionEmbeddingSine |
| num_pos_feats: 256 |
| normalize: true |
| scale: null |
| temperature: 10000 |
| d_model: 256 |
| backbone_channel_list: |
| - 1152 |
| - 576 |
| - 288 |
| - 144 |
| fpn_top_down_levels: |
| - 2 |
| - 3 |
| fpn_interp_model: nearest |
| memory_attention: |
| _target_: sam2.modeling.memory_attention.MemoryAttention |
| d_model: 256 |
| pos_enc_at_input: true |
| layer: |
| _target_: sam2.modeling.memory_attention.MemoryAttentionLayer |
| activation: relu |
| dim_feedforward: 2048 |
| dropout: 0.1 |
| pos_enc_at_attn: false |
| self_attention: |
| _target_: sam2.modeling.sam.transformer.RoPEAttention |
| rope_theta: 10000.0 |
| feat_sizes: |
| - 64 |
| - 64 |
| embedding_dim: 256 |
| num_heads: 1 |
| downsample_rate: 1 |
| dropout: 0.1 |
| d_model: 256 |
| pos_enc_at_cross_attn_keys: true |
| pos_enc_at_cross_attn_queries: false |
| cross_attention: |
| _target_: sam2.modeling.sam.transformer.RoPEAttention |
| rope_theta: 10000.0 |
| feat_sizes: |
| - 64 |
| - 64 |
| rope_k_repeat: true |
| embedding_dim: 256 |
| num_heads: 1 |
| downsample_rate: 1 |
| dropout: 0.1 |
| kv_in_dim: 64 |
| num_layers: 4 |
| memory_encoder: |
| _target_: sam2.modeling.memory_encoder.MemoryEncoder |
| out_dim: 64 |
| position_encoding: |
| _target_: sam2.modeling.position_encoding.PositionEmbeddingSine |
| num_pos_feats: 64 |
| normalize: true |
| scale: null |
| temperature: 10000 |
| mask_downsampler: |
| _target_: sam2.modeling.memory_encoder.MaskDownSampler |
| kernel_size: 3 |
| stride: 2 |
| padding: 1 |
| fuser: |
| _target_: sam2.modeling.memory_encoder.Fuser |
| layer: |
| _target_: sam2.modeling.memory_encoder.CXBlock |
| dim: 256 |
| kernel_size: 7 |
| padding: 3 |
| layer_scale_init_value: 1.0e-06 |
| use_dwconv: true |
| num_layers: 2 |
| num_maskmem: 7 |
| image_size: 1024 |
| sigmoid_scale_for_mem_enc: 20.0 |
| sigmoid_bias_for_mem_enc: -10.0 |
| use_mask_input_as_output_without_sam: true |
| directly_add_no_mem_embed: true |
| no_obj_embed_spatial: true |
| use_high_res_features_in_sam: true |
| multimask_output_in_sam: true |
| iou_prediction_use_sigmoid: true |
| use_obj_ptrs_in_encoder: true |
| add_tpos_enc_to_obj_ptrs: true |
| proj_tpos_enc_in_obj_ptrs: true |
| use_signed_tpos_enc_to_obj_ptrs: true |
| only_obj_ptrs_in_the_past_for_eval: true |
| pred_obj_scores: true |
| pred_obj_scores_mlp: true |
| fixed_no_obj_ptr: true |
| multimask_output_for_tracking: true |
| use_multimask_token_for_obj_ptr: true |
| multimask_min_pt_num: 0 |
| multimask_max_pt_num: 1 |
| use_mlp_for_obj_ptr_proj: true |
| compile_image_encoder: false |
| prob_to_use_pt_input_for_train: 0.5 |
| prob_to_use_pt_input_for_eval: 0.0 |
| prob_to_use_box_input_for_train: 0.5 |
| prob_to_use_box_input_for_eval: 0.0 |
| prob_to_sample_from_gt_for_train: 0.1 |
| num_frames_to_correct_for_train: 2 |
| num_frames_to_correct_for_eval: 1 |
| rand_frames_to_correct_for_train: true |
| add_all_frames_to_correct_as_cond: true |
| num_init_cond_frames_for_train: 2 |
| rand_init_cond_frames_for_train: true |
| num_correction_pt_per_frame: 7 |
| use_act_ckpt_iterative_pt_sampling: false |
| num_init_cond_frames_for_eval: 1 |
| forward_backbone_per_frame_for_eval: true |
| data: |
| train: |
| _target_: training.dataset.sam2_datasets.TorchTrainMixedDataset |
| phases_per_epoch: 1 |
| batch_sizes: |
| - 1 |
| datasets: |
| - _target_: training.dataset.utils.RepeatFactorWrapper |
| dataset: |
| _target_: training.dataset.utils.ConcatDataset |
| datasets: |
| - _target_: training.dataset.vos_dataset.VOSDataset |
| transforms: |
| - _target_: training.dataset.transforms.ComposeAPI |
| transforms: |
| - _target_: training.dataset.transforms.RandomHorizontalFlip |
| consistent_transform: true |
| - _target_: training.dataset.transforms.RandomAffine |
| degrees: 25 |
| shear: 20 |
| image_interpolation: bilinear |
| consistent_transform: true |
| - _target_: training.dataset.transforms.RandomResizeAPI |
| sizes: 1024 |
| square: true |
| consistent_transform: true |
| - _target_: training.dataset.transforms.ColorJitter |
| consistent_transform: true |
| brightness: 0.1 |
| contrast: 0.03 |
| saturation: 0.03 |
| hue: null |
| - _target_: training.dataset.transforms.RandomGrayscale |
| p: 0.05 |
| consistent_transform: true |
| - _target_: training.dataset.transforms.ColorJitter |
| consistent_transform: false |
| brightness: 0.1 |
| contrast: 0.05 |
| saturation: 0.05 |
| hue: null |
| - _target_: training.dataset.transforms.ToTensorAPI |
| - _target_: training.dataset.transforms.NormalizeAPI |
| mean: |
| - 0.485 |
| - 0.456 |
| - 0.406 |
| std: |
| - 0.229 |
| - 0.224 |
| - 0.225 |
| training: true |
| video_dataset: |
| _target_: training.dataset.vos_raw_dataset.PNGRawDataset |
| img_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/JPEGImages |
| gt_folder: /home/hossein/hossein/projects/sam2/training/ptmc-data/Annotations |
| file_list_txt: null |
| sampler: |
| _target_: training.dataset.vos_sampler.RandomUniformSampler |
| num_frames: 8 |
| max_num_objects: 4 |
| multiplier: 2 |
| shuffle: true |
| num_workers: 3 |
| pin_memory: true |
| drop_last: true |
| collate_fn: |
| _target_: training.utils.data_utils.collate_fn |
| _partial_: true |
| dict_key: all |
| optim: |
| amp: |
| enabled: true |
| amp_dtype: bfloat16 |
| optimizer: |
| _target_: torch.optim.AdamW |
| gradient_clip: |
| _target_: training.optimizer.GradientClipper |
| max_norm: 0.1 |
| norm_type: 2 |
| param_group_modifiers: |
| - _target_: training.optimizer.layer_decay_param_modifier |
| _partial_: true |
| layer_decay_value: 0.9 |
| apply_to: image_encoder.trunk |
| overrides: |
| - pattern: '*pos_embed*' |
| value: 1.0 |
| options: |
| lr: |
| - scheduler: |
| _target_: fvcore.common.param_scheduler.CosineParamScheduler |
| start_value: 5.0e-06 |
| end_value: 5.000000000000001e-07 |
| - scheduler: |
| _target_: fvcore.common.param_scheduler.CosineParamScheduler |
| start_value: 3.0e-06 |
| end_value: 3.0e-07 |
| param_names: |
| - image_encoder.* |
| weight_decay: |
| - scheduler: |
| _target_: fvcore.common.param_scheduler.ConstantParamScheduler |
| value: 0.1 |
| - scheduler: |
| _target_: fvcore.common.param_scheduler.ConstantParamScheduler |
| value: 0.0 |
| param_names: |
| - '*bias*' |
| module_cls_names: |
| - torch.nn.LayerNorm |
| loss: |
| all: |
| _target_: training.loss_fns.MultiStepMultiMasksAndIous |
| weight_dict: |
| loss_mask: 20 |
| loss_dice: 1 |
| loss_iou: 1 |
| loss_class: 1 |
| supervise_all_iou: true |
| iou_use_l1_loss: true |
| pred_obj_scores: true |
| focal_gamma_obj_score: 0.0 |
| focal_alpha_obj_score: -1.0 |
| distributed: |
| backend: nccl |
| find_unused_parameters: true |
| logging: |
| tensorboard_writer: |
| _target_: training.utils.logger.make_tensorboard_logger |
| log_dir: /ephemeral/hossein/output/sam2/tensorboard |
| flush_secs: 120 |
| should_log: true |
| log_dir: /ephemeral/hossein/output/sam2/logs |
| log_freq: 10 |
| checkpoint: |
| save_dir: /ephemeral/hossein/output/sam2/checkpoints |
| save_freq: 1 |
| model_weight_initializer: |
| _partial_: true |
| _target_: training.utils.checkpoint_utils.load_state_dict_into_model |
| strict: true |
| ignore_unexpected_keys: null |
| ignore_missing_keys: null |
| state_dict: |
| _target_: training.utils.checkpoint_utils.load_checkpoint_and_apply_kernels |
| checkpoint_path: /home/hossein/hossein/projects/sam2/checkpoints/sam2.1_hiera_large.pt |
| ckpt_state_dict_keys: |
| - model |
| launcher: |
| num_nodes: 1 |
| gpus_per_node: 4 |
| experiment_log_dir: /ephemeral/hossein/output/sam2 |
| submitit: |
| partition: null |
| account: null |
| qos: null |
| cpus_per_task: 10 |
| use_cluster: false |
| timeout_hour: 24 |
| name: null |
| port_range: |
| - 10000 |
| - 65000 |
|
|