---
# Cocoa pod 5-class — EVA-02-Large fine-tuning on MI300X
# Backbone: EVA-02-L ViT-L/14 pretrained MIM-CLIP, fine-tuned IN22k+IN1k @ 448
# Dataset : merged LatAm + Peru YOLO crops (~4.9k crops, 5 classes)
#
# Resolution is 448 — quadruples per-image memory vs Track 2's 224×224 DINOv2,
# so batch_size and num_workers are scaled down. Saturate later if VRAM allows.

seed: 123

data:
  # splits.json built by prepare_cocoa_data.py — paths inside it are absolute
  splits_file: splits.json
  num_workers: 4
  pin_memory: true

model:
  # timm name for EVA-02-Large pretrained MIM (LAION) + IN22k + IN1k FT @ 448
  name: eva02_large_patch14_448.mim_m38m_ft_in22k_in1k
  num_classes: 5
  drop_path_rate: 0.1
  img_size: 448

train:
  # Single-phase fine-tune (matches the simplified track2/train.py behavior)
  lr: 1.0e-4
  epochs: 20  # ~5k crops × 20 epochs ≈ 100k samples → ≈1.5k optimizer steps at BS=64
  batch_size: 64  # safe at 448px on a 192GB MI300X; raise if you have headroom
  grad_accum_steps: 1
  weight_decay: 0.05
  betas: [0.9, 0.999]
  label_smoothing: 0.1
  mixup_alpha: 0.1  # matches the reduced setting that worked in Track 2
  cutmix_alpha: 0.5
  mixup_prob: 0.5
  grad_clip: 1.0
  amp_dtype: bfloat16  # MI300X has native bf16
  compile: false  # enable after the first stable run if you want speed
  grad_checkpointing: true  # 448px attn maps are big; checkpoint to free VRAM

augment:
  rand_augment_n: 2
  rand_augment_m: 9
  random_erasing_p: 0.25
  horizontal_flip: true
  vertical_flip: true  # cocoa pods don't have a strong up/down orientation

eval:
  batch_size: 64
  tta_rounds: 10

log:
  every_n_steps: 50
  checkpoint_dir: runs
  # rare classes (witches_broom, carmenta) → optimize macro F1
  save_best_metric: val_macro_f1