gaurv007 committed
Commit 970316e · verified · 1 Parent(s): 55761de

Fix: DeBERTa-v3 fp16 crash on T4 — use fp32, batch=2, grad_accum=16

Files changed (1)
  1. ml/ClauseGuard_DeBERTa_Training.ipynb +11 -11
ml/ClauseGuard_DeBERTa_Training.ipynb CHANGED
@@ -103,14 +103,14 @@
   "# Stage 1: LEDGAR config\n",
   "STAGE1_EPOCHS = 5 # LEDGAR is large, converges fast\n",
   "STAGE1_LR = 2e-5\n",
- "STAGE1_BATCH = 4 # T4: reduced from 8 (16GB VRAM)\n",
- "STAGE1_GRAD_ACCUM = 8 # effective batch = 32 (4 * 8)\n",
+ "STAGE1_BATCH = 2 # T4 fp32: reduced for DeBERTa-v3 compatibility\n",
+ "STAGE1_GRAD_ACCUM = 16 # effective batch = 32 (2 * 16)\n",
   "\n",
   "# Stage 2: CUAD config \n",
   "STAGE2_EPOCHS = 20\n",
   "STAGE2_LR = 1e-5 # lower LR for fine-tuning pretrained model\n",
- "STAGE2_BATCH = 4 # T4: reduced from 8\n",
- "STAGE2_GRAD_ACCUM = 8 # effective batch = 32 (4 * 8)\n",
+ "STAGE2_BATCH = 2 # T4 fp32: reduced for DeBERTa-v3 compatibility\n",
+ "STAGE2_GRAD_ACCUM = 16 # effective batch = 32 (2 * 16)\n",
   "EARLY_STOPPING_PATIENCE = 3\n",
   "\n",
   "# ASL hyperparameters (from arxiv 2009.14119)\n",
@@ -593,7 +593,7 @@
   " output_dir=\"./stage1_ledgar\",\n",
   " num_train_epochs=STAGE1_EPOCHS,\n",
   " per_device_train_batch_size=STAGE1_BATCH,\n",
- " per_device_eval_batch_size=8,\n",
+ " per_device_eval_batch_size=4,\n",
   " gradient_accumulation_steps=STAGE1_GRAD_ACCUM,\n",
   " learning_rate=STAGE1_LR,\n",
   " weight_decay=WEIGHT_DECAY,\n",
@@ -605,12 +605,12 @@
   " load_best_model_at_end=True,\n",
   " metric_for_best_model=\"macro_f1\",\n",
   " greater_is_better=True,\n",
- " bf16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,\n",
- " fp16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,\n",
+ " bf16=False, # DeBERTa-v3 breaks with fp16 gradient scaler; fp32 is safest on T4\n",
+ " fp16=False,\n",
   " logging_strategy=\"steps\",\n",
   " logging_steps=50,\n",
   " logging_first_step=True,\n",
- " disable_tqdm=False, # Keep progress bar in Colab\n",
+ " disable_tqdm=False,\n",
   " report_to=\"none\",\n",
   " dataloader_num_workers=2,\n",
   " seed=SEED,\n",
@@ -725,7 +725,7 @@
   " output_dir=\"./stage2_cuad\",\n",
   " num_train_epochs=STAGE2_EPOCHS,\n",
   " per_device_train_batch_size=STAGE2_BATCH,\n",
- " per_device_eval_batch_size=8,\n",
+ " per_device_eval_batch_size=4,\n",
   " gradient_accumulation_steps=STAGE2_GRAD_ACCUM,\n",
   " learning_rate=STAGE2_LR,\n",
   " weight_decay=WEIGHT_DECAY,\n",
@@ -737,8 +737,8 @@
   " load_best_model_at_end=True,\n",
   " metric_for_best_model=\"macro_f1\",\n",
   " greater_is_better=True,\n",
- " bf16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,\n",
- " fp16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,\n",
+ " bf16=False, # DeBERTa-v3 breaks with fp16 gradient scaler; fp32 is safest on T4\n",
+ " fp16=False,\n",
   " logging_strategy=\"steps\",\n",
   " logging_steps=25,\n",
   " logging_first_step=True,\n",