# Bavantha11 / m2h-mx

---
# Experiment configuration: arch `m2h_mx_b` trained on the `NYUD` dataset.
# NOTE(review): the meaning of individual keys is inferred from their names;
# confirm against the code that consumes this file.

# Dataset location, image geometry, depth range and augmentation.
dataset:
  name: NYUD
  root: data/NYUDv2
  image_size: [480, 640]  # presumably [height, width] — TODO confirm
  # The next three values are repeated under `model` (and `num_classes` again
  # as `m2h_mx.num_seg_classes`); keep them in sync when editing.
  num_classes: 40
  min_depth: 0.01
  max_depth: 10.0
  visual_min_depth: 0.01
  visual_max_depth: 10.0
  augment:
    random_scale: [1.0, 1.2]
    random_crop: true
    horizontal_flip: true
    color_jitter:
      brightness: 0.2
      contrast: 0.2
      saturation: 0.2
      hue: 0.2
    erase_prob: 0.0

# Training loop, logging and checkpointing.
training:
  epochs: 120
  batch_size: 6
  eval_batch_size: 12
  num_workers: 4
  device: cuda
  mixed_precision: true
  log_interval: 50
  ckpt_interval: 1
  grad_clip: 1.0
  output_dir: outputs/nyudv2_m2h_mx_b
  # NOTE(review): an EMA decay is set while `eval_use_ema` is false —
  # confirm this combination is intended.
  ema_decay: 0.999
  eval_use_ema: false
  finetune: true

# Optimizer and learning-rate schedule.
optimization:
  # Exponent floats keep a decimal point and a signed exponent so that
  # YAML 1.1 loaders (e.g. PyYAML) resolve them as floats, not strings.
  lr: 5.0e-5
  weight_decay: 0.02
  betas: [0.9, 0.999]
  warmup_epochs: 0
  scheduler:
    type: cosine
    min_lr: 1.0e-5

# Task switches; all four tasks are enabled in this configuration.
tasks:
  include_semseg: true
  include_depth: true
  include_edge: true
  include_normals: true
  edge_pos_weight: 1.0

# Loss weights.
loss:
  weights:
    semseg: 2.0
    depth_si: 2.0
    edge: 0.5
    normals: 1.0
  focal_for_edges: true
  # NOTE(review): `geom_consistency_weight` and `consistency_depth_normals`
  # carry the same value; confirm they control distinct loss terms.
  geom_consistency_weight: 0.2
  consistency_depth_normals: 0.2
  consistency_sem_edge: 0.25
  depth_scale_weight: 0.2
  depth_coarse_weight: 0.2
  depth_offset_weight: 0.15
  depth_bin_weight: 0.3
  edge_dice_weight: 0.1
  use_uncertainty_balancer: false

# Model selection; class count and depth range repeat the `dataset` values.
model:
  arch: m2h_mx_b
  num_classes: 40
  min_depth: 0.01
  max_depth: 10.0

# Architecture-specific settings.
m2h_mx:
  decoder_dim: 256
  num_seg_classes: 40  # same value as dataset.num_classes / model.num_classes
  backbone_lr_scale: 0.2
  ltc_window_size: 4
  hm_d_state: 32
  hm_drop_path: 0.1
  gtf_extra_levels: 2
  train_last_n_blocks: 4
  intermediate_layer_indices: [2, 5, 8, 11]
  # NOTE(review): `depth_aux_weight` (0.5) and `aux_weights.depth` (0.3)
  # differ; confirm which one the model reads for the depth auxiliary loss.
  depth_aux_weight: 0.5
  num_register_tokens: 4
  use_lora: true
  lora_rank: 16
  lora_alpha: 32.0
  lora_dropout: 0.05
  backbone_name: facebook/dinov3-vitb16-pretrain-lvd1689m
  depth_bins: 64
  aux_weights:
    semseg: 0.3
    depth: 0.3
    edge: 0.15
    normals: 0.15

# Validation cadence and the metrics used to pick the best checkpoint.
validation:
  interval_steps: 100
  save_best_on: ['sem_mIoU', 'dep_AbsRel']