---
# Cityscapes training config for m2h_mx_l (semantic segmentation + depth).
# Repository: Bavantha11/m2h-mx

# Dataset location, geometry, and train-time augmentation settings.
dataset:
  name: Cityscapes
  root: data/cityscapes
  # NOTE(review): presumably the folder of precomputed disparity maps under
  # `root` - confirm against the data loader.
  disparity_subdir: crestereo_disparity
  # NOTE(review): 800 + 224 = 1024, the native Cityscapes frame height, so
  # image_size is presumably (height, width) after the bottom crop - confirm.
  crop_bottom: 224
  image_size: [800, 2048]
  # The same class count is repeated in model.num_classes and
  # m2h_mx.num_seg_classes; keep the three values in sync.
  num_classes: 19
  # The depth range is repeated in model.min_depth / model.max_depth.
  # NOTE(review): a 0.0 lower bound is risky if any depth loss takes a log
  # of the target - confirm zero/invalid depth is masked or clamped.
  min_depth: 0.0
  max_depth: 80.0
  visual_min_depth: 0.0
  visual_max_depth: 80.0
  augment:
    random_scale: [0.8, 1.2]
    random_crop: true
    horizontal_flip: true
    color_jitter:
      brightness: 0.2
      contrast: 0.2
      saturation: 0.2
      hue: 0.1
    erase_prob: 0.05
    blur_prob: 0.08
    noise_std: 0.01
    gamma: [0.9, 1.1]

# Run-level settings: schedule length, batching, hardware, logging, output.
training:
  epochs: 160
  batch_size: 2
  eval_batch_size: 4
  num_workers: 8
  device: cuda
  mixed_precision: true
  log_interval: 25
  ckpt_interval: 1
  grad_clip: 1.0
  output_dir: outputs/cityscapes_m2h_mx_l
  # NOTE(review): an EMA decay is set but evaluation does not use the EMA
  # weights (eval_use_ema: false) - confirm this is intentional.
  ema_decay: 0.999
  eval_use_ema: false
  use_static_graph: false
  # Reset scheduler/steps when resuming from ScanNet weights.
  finetune: true

# Optimizer hyperparameters and learning-rate schedule.
optimization:
  # Lower for stability; warmup added.
  lr: 7.0e-5
  weight_decay: 0.02
  betas: [0.9, 0.999]
  warmup_epochs: 5
  scheduler:
    type: cosine
    min_lr: 5.0e-6

# Task toggles: only semantic segmentation and depth are enabled.
tasks:
  include_semseg: true
  include_depth: true
  include_edge: false
  include_normals: false
  include_plane: false
  include_confidence: false

# Loss weighting for the enabled tasks.
loss:
  weights:
    semseg: 2.5  # emphasize semantics
    depth_si: 2.0  # keep depth learning strong
  focal_for_edges: false
  depth_scale_weight: 0.05
  depth_coarse_weight: 0.25
  depth_offset_weight: 0.2
  depth_bin_weight: 0.25
  use_uncertainty_balancer: false

# Architecture selection. num_classes / min_depth / max_depth repeat the
# values under `dataset`; change them together.
model:
  arch: m2h_mx_l
  num_classes: 19
  min_depth: 0.0
  max_depth: 80.0

# Architecture-specific hyperparameters for m2h_mx.
m2h_mx:
  decoder_dim: 256
  # Repeats dataset.num_classes / model.num_classes.
  num_seg_classes: 19
  # Lower backbone LR when all blocks are unfrozen.
  backbone_lr_scale: 0.03
  ltc_window_size: 4
  hm_d_state: 32
  hm_drop_path: 0.1
  gtf_extra_levels: 2
  # DINOv3-L has 24 blocks; unfreeze all for fine-tuning.
  # NOTE(review): all blocks are unfrozen while use_lora is also true -
  # confirm that combining full fine-tuning with LoRA is intended.
  train_last_n_blocks: 24
  # NOTE(review): presumably 0-based block indices (23 = last of 24) - confirm.
  intermediate_layer_indices: [5, 11, 17, 23]
  num_register_tokens: 4
  use_lora: true
  lora_rank: 16
  lora_alpha: 32.0
  lora_dropout: 0.05
  backbone_name: facebook/dinov3-vitl16-pretrain-lvd1689m
  depth_bins: 64
  # NOTE(review): depth_aux_weight and aux_weights.depth are both 0.2 -
  # confirm whether both are read or one is redundant.
  depth_aux_weight: 0.2
  aux_weights:
    semseg: 0.5
    depth: 0.2

# Validation cadence and the metrics used to select the best checkpoint.
validation:
  interval_steps: 100
  save_best_on: ['sem_mIoU', 'dep_AbsRel']