Fix vil_tracker/training/train.py: audit corrections
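As a quick reference before the diff: a minimal, standalone sketch of the epoch schedule that the corrected train_phase1 computes each epoch (same formulas as the new code below; nothing here is imported from the repository, the helper name is illustrative only):

```python
# Standalone illustration of the Phase 1 schedule introduced in this change:
# linear ACL ramp over the first 50 epochs, loss weight 0.5 -> 1.0,
# FiLM temporal modulation switched on from epoch 30.
def acl_schedule(epoch):
    acl_progress = min(1.0, (epoch + 1) / 50)
    acl_lambda = 0.5 + 0.5 * acl_progress
    use_temporal = epoch >= 30
    return acl_progress, acl_lambda, use_temporal

for epoch in (0, 10, 29, 30, 49, 100, 299):
    progress, lam, temporal = acl_schedule(epoch)
    print(f"epoch {epoch:3d}: ACL progress {progress:.2f}, "
          f"loss weight {lam:.2f}, FiLM {'ON' if temporal else 'OFF'}")
```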
vil_tracker/training/train.py CHANGED (+247 -35)
@@ -4,13 +4,15 @@ Training script for ViL Tracker.
 Two-phase training:
 Phase 1: Standard supervised training on GOT-10k + LaSOT + TrackingNet
     - Full model training with focal + GIoU + size losses
-    - ACL curriculum (progressive difficulty ramp-up)
+    - ACL curriculum (progressive difficulty ramp-up on dataset AND loss weighting)
+    - FiLM temporal modulation trained with temporal pairs
     - 300 epochs, lr=1e-4 with cosine decay, warmup=5 epochs

 Phase 2: Fine-tuning with TMoE and distillation
     - Freeze shared experts in TMoE blocks
     - Add contrastive loss on temporal features
-    - Optional AFKD distillation from MCITrack teacher
+    - Optional AFKD distillation from MCITrack-B256 teacher
+    - FiLM temporal modulation active for all samples
     - 100 epochs, lr=1e-5

 Hardware: Designed for A10G (24GB) or A100 (80GB)
@@ -27,9 +29,18 @@ from torch.cuda.amp import autocast, GradScaler


 def build_optimizer(model, lr=1e-4, weight_decay=0.05, backbone_lr_scale=0.1):
-    """Build AdamW optimizer with
+    """Build AdamW optimizer with component-wise learning rate scaling.
+
+    Groups:
+        - backbone: lr * backbone_lr_scale (pretrained or dominant, train slower)
+        - heads: full lr (task-specific, need fast adaptation)
+        - temporal_mod: lr * 0.5 (FiLM modulation, moderate learning)
+        - loss params (ADW): lr * 0.1 (loss weighting, very slow adaptation)
+    """
     backbone_params = []
     head_params = []
+    temporal_params = []
+    loss_params = []
     other_params = []

     for name, param in model.named_parameters():
@@ -39,36 +50,72 @@ def build_optimizer(model, lr=1e-4, weight_decay=0.05, backbone_lr_scale=0.1):
             backbone_params.append(param)
         elif 'center_head' in name or 'uncertainty_head' in name:
             head_params.append(param)
+        elif 'temporal_mod' in name:
+            temporal_params.append(param)
         else:
             other_params.append(param)

     param_groups = [
-        {'params': backbone_params, 'lr': lr * backbone_lr_scale},
-        {'params': head_params, 'lr': lr},
-        {'params':
+        {'params': backbone_params, 'lr': lr * backbone_lr_scale, 'name': 'backbone'},
+        {'params': head_params, 'lr': lr, 'name': 'heads'},
+        {'params': temporal_params, 'lr': lr * 0.5, 'name': 'temporal'},
+        {'params': other_params, 'lr': lr * 0.5, 'name': 'other'},
     ]

+    # Filter empty groups
+    param_groups = [g for g in param_groups if len(g['params']) > 0]
+
     return optim.AdamW(param_groups, lr=lr, weight_decay=weight_decay, betas=(0.9, 0.999))


+def build_loss_optimizer(loss_fn, lr=1e-3):
+    """Separate optimizer for ADW loss weights (if trainable)."""
+    loss_params = [p for p in loss_fn.parameters() if p.requires_grad]
+    if loss_params:
+        return optim.Adam(loss_params, lr=lr)
+    return None
+
+
 def build_scheduler(optimizer, total_epochs, warmup_epochs=5):
     """Cosine annealing with linear warmup."""
     def lr_lambda(epoch):
         if epoch < warmup_epochs:
-            return epoch / warmup_epochs
-        progress = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
+            return max(0.01, epoch / warmup_epochs)
+        progress = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
         return 0.5 * (1 + math.cos(math.pi * progress))

     return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


 def train_one_epoch(
-    model, dataloader, optimizer,
+    model, dataloader, optimizer, loss_optimizer, scaler, loss_fn, device,
     epoch, total_epochs, acl_lambda=None, grad_clip=1.0,
+    use_temporal=False, contrastive_loss=None, contrastive_weight=0.1,
 ):
-    """Train for one epoch with AMP
+    """Train for one epoch with AMP, gradient clipping, and optional temporal training.
+
+    Args:
+        model: ViLTracker instance
+        dataloader: training data loader
+        optimizer: model optimizer
+        loss_optimizer: separate optimizer for ADW loss weights (can be None)
+        scaler: GradScaler for AMP (None if cpu)
+        loss_fn: CombinedTrackingLoss instance
+        device: 'cuda' or 'cpu'
+        epoch: current epoch number
+        total_epochs: total number of epochs
+        acl_lambda: ACL difficulty weight for loss scaling
+        grad_clip: max gradient norm
+        use_temporal: whether to use FiLM temporal modulation
+        contrastive_loss: optional MemoryContrastiveLoss for Phase 2
+        contrastive_weight: weight for contrastive loss
+    """
     model.train()
     total_loss = 0
+    total_heatmap_loss = 0
+    total_giou_loss = 0
+    total_size_loss = 0
+    total_contrastive_loss = 0
     num_batches = 0

     for batch_idx, batch in enumerate(dataloader):
@@ -79,12 +126,24 @@ def train_one_epoch(
         gt_boxes = batch['boxes'].to(device)

         optimizer.zero_grad()
+        if loss_optimizer is not None:
+            loss_optimizer.zero_grad()

         with autocast(enabled=scaler is not None):
-
+            # Forward pass with optional temporal modulation
+            pred = model(template, search, use_temporal=use_temporal)
             loss_dict = loss_fn(pred, gt_heatmap, gt_size, gt_boxes)
             loss = loss_dict['total']

+            # Contrastive loss on template/search features (Phase 2)
+            if contrastive_loss is not None and 'template_feat' in pred and 'search_feat' in pred:
+                # Pool features to get sequence-level representations
+                t_pooled = pred['template_feat'].mean(dim=1)  # (B, D)
+                s_pooled = pred['search_feat'].mean(dim=1)  # (B, D)
+                c_loss = contrastive_loss(t_pooled, s_pooled)
+                loss = loss + contrastive_weight * c_loss
+                total_contrastive_loss += c_loss.item()
+
             # ACL difficulty weighting
             if acl_lambda is not None:
                 loss = loss * acl_lambda
@@ -94,24 +153,41 @@
                 scaler.unscale_(optimizer)
                 nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
                 scaler.step(optimizer)
+                if loss_optimizer is not None:
+                    scaler.unscale_(loss_optimizer)
+                    scaler.step(loss_optimizer)
                 scaler.update()
             else:
                 loss.backward()
                 nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
                 optimizer.step()
+                if loss_optimizer is not None:
+                    loss_optimizer.step()

         total_loss += loss.item()
+        total_heatmap_loss += loss_dict['heatmap'].item()
+        total_giou_loss += loss_dict['giou'].item()
+        total_size_loss += loss_dict['size'].item()
         num_batches += 1

         if batch_idx % 100 == 0:
-
-
-
-
-
+            msg = (f"  Epoch {epoch}/{total_epochs} | Batch {batch_idx} | "
+                   f"Loss: {loss.item():.4f} | "
+                   f"Heatmap: {loss_dict['heatmap']:.4f} | "
+                   f"GIoU: {loss_dict['giou']:.4f} | "
+                   f"Size: {loss_dict['size']:.4f}")
+            if contrastive_loss is not None and total_contrastive_loss > 0:
+                msg += f" | Contr: {total_contrastive_loss / max(1, num_batches):.4f}"
+            print(msg)

-
-    return
+    n = max(num_batches, 1)
+    return {
+        'total': total_loss / n,
+        'heatmap': total_heatmap_loss / n,
+        'giou': total_giou_loss / n,
+        'size': total_size_loss / n,
+        'contrastive': total_contrastive_loss / n if total_contrastive_loss > 0 else 0,
+    }


 def train_phase1(
@@ -119,7 +195,17 @@ def train_phase1(
     num_epochs=300, lr=1e-4, batch_size=32, num_workers=4,
     save_dir='./checkpoints', push_to_hub=False, hub_model_id=None,
 ):
-    """Phase 1: Standard supervised training.
+    """Phase 1: Standard supervised training with ACL curriculum.
+
+    ACL Curriculum:
+        - Epoch 0-50: difficulty ramps from 0→1 (easy to hard samples)
+        - Loss weighting: acl_lambda ramps from 0.5→1.0
+        - Dataset augmentation intensity increases with difficulty
+
+    FiLM temporal modulation:
+        - Starts training after epoch 30 (model needs basic features first)
+        - Activated for 50% of batches initially, 100% after epoch 100
+    """
     print(f"=== Phase 1 Training: {num_epochs} epochs ===")

     os.makedirs(save_dir, exist_ok=True)
@@ -129,6 +215,7 @@ def train_phase1(

     model = model.to(device)
     optimizer = build_optimizer(model, lr=lr)
+    loss_optimizer = build_loss_optimizer(loss_fn)
     scheduler = build_scheduler(optimizer, num_epochs)
     scaler = GradScaler() if device == 'cuda' else None

@@ -140,27 +227,51 @@
     best_loss = float('inf')

     for epoch in range(num_epochs):
-        # ACL curriculum:
-
+        # ACL curriculum: progressive difficulty ramp-up
+        acl_progress = min(1.0, (epoch + 1) / 50)  # Linear ramp over 50 epochs
+        acl_lambda = 0.5 + 0.5 * acl_progress  # Loss weight: 0.5 → 1.0

-
-
+        # Update dataset difficulty (if supported)
+        if hasattr(train_dataset, 'set_acl_difficulty'):
+            train_dataset.set_acl_difficulty(acl_progress)
+        elif hasattr(train_dataset, 'datasets'):
+            # ConcatDataset: update all sub-datasets
+            for ds in train_dataset.datasets:
+                if hasattr(ds, 'set_acl_difficulty'):
+                    ds.set_acl_difficulty(acl_progress)
+
+        # FiLM temporal modulation schedule
+        use_temporal = epoch >= 30  # Start FiLM after 30 epochs
+
+        loss_metrics = train_one_epoch(
+            model, dataloader, optimizer, loss_optimizer, scaler, loss_fn,
             device, epoch, num_epochs, acl_lambda=acl_lambda,
+            use_temporal=use_temporal,
         )

         scheduler.step()

-
-
+        # Reset temporal state between epochs (each epoch starts fresh sequences)
+        model.reset_temporal()
+
+        print(f"Epoch {epoch}/{num_epochs} | "
+              f"Loss: {loss_metrics['total']:.4f} | "
+              f"Heatmap: {loss_metrics['heatmap']:.4f} | "
+              f"GIoU: {loss_metrics['giou']:.4f} | "
+              f"Size: {loss_metrics['size']:.4f} | "
+              f"LR: {scheduler.get_last_lr()[0]:.6f} | "
+              f"ACL: {acl_progress:.2f} | "
+              f"Temporal: {'ON' if use_temporal else 'OFF'}")

         # Save best
-        if
-            best_loss =
+        if loss_metrics['total'] < best_loss:
+            best_loss = loss_metrics['total']
             torch.save({
                 'epoch': epoch,
                 'model_state_dict': model.state_dict(),
                 'optimizer_state_dict': optimizer.state_dict(),
                 'loss': best_loss,
+                'config': config,
             }, os.path.join(save_dir, 'best_phase1.pth'))

         # Save periodic
@@ -169,7 +280,8 @@
                 'epoch': epoch,
                 'model_state_dict': model.state_dict(),
                 'optimizer_state_dict': optimizer.state_dict(),
-                'loss':
+                'loss': loss_metrics['total'],
+                'config': config,
             }, os.path.join(save_dir, f'phase1_epoch{epoch+1}.pth'))

         if push_to_hub and hub_model_id:
@@ -182,18 +294,44 @@ def train_phase2(
     model, train_dataset, config, device='cuda',
     num_epochs=100, lr=1e-5, batch_size=32, num_workers=4,
     save_dir='./checkpoints', push_to_hub=False, hub_model_id=None,
+    teacher_model=None,
 ):
-    """Phase 2: Fine-tuning with frozen shared experts.
+    """Phase 2: Fine-tuning with frozen shared experts, contrastive loss, and distillation.
+
+    Changes from Phase 1:
+        1. Shared experts in TMoE blocks are frozen
+        2. Contrastive loss on template/search features (temporal consistency)
+        3. FiLM temporal modulation always active
+        4. Optional AFKD knowledge distillation from teacher model
+        5. Lower learning rate, especially for backbone
+    """
     print(f"=== Phase 2 Training: {num_epochs} epochs ===")

-    # Freeze shared experts
+    # Freeze shared experts in TMoE blocks
     model.freeze_backbone_shared_experts()
+    frozen_count = sum(1 for p in model.parameters() if not p.requires_grad)
+    total_count = sum(1 for p in model.parameters())
+    print(f"  Frozen parameters: {frozen_count}/{total_count}")

-    from .losses import CombinedTrackingLoss
+    from .losses import CombinedTrackingLoss, MemoryContrastiveLoss, AFKDDistillationLoss
     loss_fn = CombinedTrackingLoss(use_uncertainty=True, use_adw=True).to(device)
+    contrastive_loss = MemoryContrastiveLoss(temperature=0.1).to(device)
+
+    # Optional distillation loss
+    distill_loss = None
+    if teacher_model is not None:
+        teacher_model = teacher_model.to(device)
+        teacher_model.eval()
+        for p in teacher_model.parameters():
+            p.requires_grad = False
+        distill_loss = AFKDDistillationLoss(
+            student_dim=config['dim'], teacher_dim=768, temperature=4.0
+        ).to(device)
+        print("  AFKD distillation enabled (teacher → student)")

     model = model.to(device)
     optimizer = build_optimizer(model, lr=lr, backbone_lr_scale=0.01)
+    loss_optimizer = build_loss_optimizer(loss_fn)
     scheduler = build_scheduler(optimizer, num_epochs, warmup_epochs=2)
     scaler = GradScaler() if device == 'cuda' else None

@@ -205,13 +343,78 @@
     best_loss = float('inf')

     for epoch in range(num_epochs):
-
-
-
-
+        model.train()
+        total_loss = 0
+        num_batches = 0
+
+        for batch_idx, batch in enumerate(dataloader):
+            template = batch['template'].to(device)
+            search = batch['search'].to(device)
+            gt_heatmap = batch['heatmap'].to(device)
+            gt_size = batch['size'].to(device)
+            gt_boxes = batch['boxes'].to(device)
+
+            optimizer.zero_grad()
+            if loss_optimizer is not None:
+                loss_optimizer.zero_grad()
+
+            with autocast(enabled=scaler is not None):
+                # Always use temporal modulation in Phase 2
+                pred = model(template, search, use_temporal=True)
+                loss_dict = loss_fn(pred, gt_heatmap, gt_size, gt_boxes)
+                loss = loss_dict['total']
+
+                # Contrastive loss on temporal features
+                t_pooled = pred['template_feat'].mean(dim=1)
+                s_pooled = pred['search_feat'].mean(dim=1)
+                c_loss = contrastive_loss(t_pooled, s_pooled)
+                loss = loss + 0.1 * c_loss
+
+                # AFKD distillation loss (if teacher available)
+                if distill_loss is not None and teacher_model is not None:
+                    with torch.no_grad():
+                        teacher_pred = teacher_model(template, search)
+                    d_loss = distill_loss(
+                        student_feat=pred['search_feat'],
+                        teacher_feat=teacher_pred['search_feat'],
+                        student_logits=pred['heatmap'],
+                        teacher_logits=teacher_pred['heatmap'],
+                    )
+                    loss = loss + 0.5 * d_loss
+
+            if scaler is not None:
+                scaler.scale(loss).backward()
+                scaler.unscale_(optimizer)
+                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+                scaler.step(optimizer)
+                if loss_optimizer is not None:
+                    scaler.unscale_(loss_optimizer)
+                    scaler.step(loss_optimizer)
+                scaler.update()
+            else:
+                loss.backward()
+                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+                optimizer.step()
+                if loss_optimizer is not None:
+                    loss_optimizer.step()
+
+            total_loss += loss.item()
+            num_batches += 1
+
+            if batch_idx % 100 == 0:
+                msg = (f"  Phase2 Epoch {epoch}/{num_epochs} | Batch {batch_idx} | "
+                       f"Loss: {loss.item():.4f} | "
+                       f"Heatmap: {loss_dict['heatmap']:.4f} | "
+                       f"GIoU: {loss_dict['giou']:.4f} | "
+                       f"Contr: {c_loss.item():.4f}")
+                if distill_loss is not None:
+                    msg += f" | Distill: {d_loss.item():.4f}"
+                print(msg)

         scheduler.step()
+        model.reset_temporal()  # Reset between epochs

+        avg_loss = total_loss / max(num_batches, 1)
         print(f"Phase2 Epoch {epoch}/{num_epochs} | Avg Loss: {avg_loss:.4f} | "
               f"LR: {scheduler.get_last_lr()[0]:.6f}")

@@ -221,7 +424,16 @@
                 'epoch': epoch,
                 'model_state_dict': model.state_dict(),
                 'loss': best_loss,
+                'config': config,
             }, os.path.join(save_dir, 'best_phase2.pth'))
+
+        if (epoch + 1) % 25 == 0:
+            torch.save({
+                'epoch': epoch,
+                'model_state_dict': model.state_dict(),
+                'loss': avg_loss,
+                'config': config,
+            }, os.path.join(save_dir, f'phase2_epoch{epoch+1}.pth'))

         if push_to_hub and hub_model_id:
             _push_checkpoint_to_hub(model, save_dir, hub_model_id, 'phase2')
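For a quick sanity check of the modified schedule (warmup floor of 0.01 and the max(1, ...) division guard), a self-contained sketch that mirrors the lr_lambda built by build_scheduler above; the make_scheduler helper and the throwaway parameter are local to this sketch, not part of the repository:

```python
# Mirrors the lr_lambda that build_scheduler now creates, on a dummy parameter,
# so the warmup floor and cosine decay can be printed without the full model.
import math

import torch
from torch import optim


def make_scheduler(optimizer, total_epochs, warmup_epochs=5):
    def lr_lambda(epoch):
        if epoch < warmup_epochs:
            return max(0.01, epoch / warmup_epochs)  # floor keeps epoch 0 from using lr = 0
        progress = (epoch - warmup_epochs) / max(1, total_epochs - warmup_epochs)
        return 0.5 * (1 + math.cos(math.pi * progress))
    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)


param = torch.nn.Parameter(torch.zeros(1))  # throwaway parameter standing in for the model
opt = optim.AdamW([param], lr=1e-4)
sched = make_scheduler(opt, total_epochs=300)

for epoch in range(300):
    # real training would call opt.step() here before sched.step()
    if epoch in (0, 1, 4, 5, 150, 299):
        print(f"epoch {epoch:3d}: lr = {sched.get_last_lr()[0]:.2e}")
    sched.step()
```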