Bavantha11
/

m2h-mx

+dataset:  # dataset identity, input geometry, depth range and augmentation
+  name: ScanNet
+  root: data/scannet  # relative path; base directory not visible here -- TODO confirm
+  image_size: [480, 640]  # NOTE(review): presumably [height, width] -- confirm
+  num_classes: 40  # keep head compatible with NYUDv2 fine-tuning; same value as model.num_classes
+  min_depth: 0.1  # same value as model.min_depth; units presumably meters -- confirm
+  max_depth: 10.0  # same value as model.max_depth
+  visual_min_depth: 0.1  # currently identical to min_depth
+  visual_max_depth: 10.0  # currently identical to max_depth
+  augment:  # presumably applied to the training split only -- confirm
+    random_scale: [0.9, 1.15]  # presumably a [min, max] scale-factor range -- confirm
+    random_crop: true
+    horizontal_flip: true
+    color_jitter: {brightness: 0.25, contrast: 0.25, saturation: 0.2, hue: 0.1}
+    erase_prob: 0.15  # presumably a per-sample random-erasing probability -- confirm
+training:  # run-level settings: schedule length, batching, logging, checkpoints
+  epochs: 160
+  batch_size: 8  # training batch; evaluation uses eval_batch_size below
+  eval_batch_size: 12
+  num_workers: 8  # presumably data-loader worker processes -- confirm
+  device: cuda
+  mixed_precision: true
+  log_interval: 200  # unit (steps vs. batches) not visible here -- TODO confirm
+  ckpt_interval: 1  # NOTE(review): presumably counted in epochs -- confirm
+  grad_clip: 1.0  # presumably a max gradient norm -- confirm
+  output_dir: outputs/scannet_m2h_mx_b  # suffix matches model.arch (m2h_mx_b)
+  ema_decay: 0.999
+  eval_use_ema: false  # NOTE(review): confirm whether ema_decay is still used when false
+optimization:  # optimizer hyper-parameters and learning-rate schedule
+  lr: 1.0e-4  # decimal point plus signed exponent, so YAML 1.1 loaders read a float
+  weight_decay: 0.05
+  betas: [0.9, 0.999]
+  warmup_epochs: 0  # no separate warmup; presumably scheduler.pct_start covers ramp-up -- confirm
+  scheduler:
+    type: onecycle
+    max_lr_factor: 3.0  # NOTE(review): presumably peak lr = lr * max_lr_factor -- confirm
+    pct_start: 0.1  # key names below mirror one-cycle scheduler arguments -- confirm mapping
+    div_factor: 5.0
+    final_div_factor: 25.0
+tasks:  # task toggles; only semantic segmentation and depth are enabled here
+  include_semseg: true
+  include_depth: true
+  include_edge: false
+  include_normals: false
+  include_plane: false
+  include_confidence: false
+loss:  # loss weighting; weights lists one entry per task enabled under tasks
+  weights:
+    semseg: 2.0
+    depth_si: 3.0  # "si" presumably means scale-invariant -- confirm
+  focal_for_edges: false  # tasks.include_edge is also false in this config
+  depth_scale_weight: 0.1  # the four depth_* weights: exact loss terms not visible here
+  depth_coarse_weight: 0.3
+  depth_offset_weight: 0.15
+  depth_bin_weight: 0.3
+  use_uncertainty_balancer: false  # presumably the fixed weights above apply as-is -- confirm
+model:  # architecture selection; values below repeat the dataset section
+  arch: m2h_mx_b
+  num_classes: 40  # same value as dataset.num_classes and m2h_mx.num_seg_classes
+  min_depth: 0.1  # same value as dataset.min_depth
+  max_depth: 10.0  # same value as dataset.max_depth
+m2h_mx:  # architecture-specific settings: decoder, backbone, LoRA, auxiliary heads
+  decoder_dim: 256
+  num_seg_classes: 40  # same value as model.num_classes and dataset.num_classes
+  backbone_lr_scale: 0.05  # presumably multiplies optimization.lr for the backbone -- confirm
+  ltc_window_size: 4
+  hm_d_state: 32
+  hm_drop_path: 0.1
+  gtf_extra_levels: 2
+  train_last_n_blocks: 2  # NOTE(review): interaction with use_lora not visible -- confirm
+  intermediate_layer_indices: [2, 5, 8, 11]  # presumably 0-based backbone block indices -- confirm
+  num_register_tokens: 4
+  use_lora: true
+  lora_rank: 16
+  lora_alpha: 32.0  # alpha / rank = 2.0 with the values given here
+  lora_dropout: 0.05
+  backbone_name: facebook/dinov3-vitb16-pretrain-lvd1689m  # presumably a model-hub id -- confirm
+  depth_bins: 64
+  depth_aux_weight: 0.4  # NOTE(review): same value as aux_weights.depth; confirm which is read
+  aux_weights:
+    semseg: 0.4
+    depth: 0.4
+validation:  # validation cadence and best-checkpoint criteria
+  interval_steps: 1000  # counted in steps, per the key name
+  save_best_on: ["sem_mIoU", "dep_AbsRel"]  # metric names; per-metric direction not visible -- confirm