Fix: DeBERTa-v3 fp16 crash on T4 — use fp32, batch=2, grad_accum=16
ml/ClauseGuard_DeBERTa_Training.ipynb CHANGED
@@ -103,14 +103,14 @@
 "# Stage 1: LEDGAR config\n",
 "STAGE1_EPOCHS = 5 # LEDGAR is large, converges fast\n",
 "STAGE1_LR = 2e-5\n",
-"STAGE1_BATCH =
-"STAGE1_GRAD_ACCUM =
+"STAGE1_BATCH = 2 # T4 fp32: reduced for DeBERTa-v3 compatibility\n",
+"STAGE1_GRAD_ACCUM = 16 # effective batch = 32 (2 * 16)\n",
 "\n",
 "# Stage 2: CUAD config \n",
 "STAGE2_EPOCHS = 20\n",
 "STAGE2_LR = 1e-5 # lower LR for fine-tuning pretrained model\n",
-"STAGE2_BATCH =
-"STAGE2_GRAD_ACCUM =
+"STAGE2_BATCH = 2 # T4 fp32: reduced for DeBERTa-v3 compatibility\n",
+"STAGE2_GRAD_ACCUM = 16 # effective batch = 32 (2 * 16)\n",
 "EARLY_STOPPING_PATIENCE = 3\n",
 "\n",
 "# ASL hyperparameters (from arxiv 2009.14119)\n",

@@ -593,7 +593,7 @@
 " output_dir=\"./stage1_ledgar\",\n",
 " num_train_epochs=STAGE1_EPOCHS,\n",
 " per_device_train_batch_size=STAGE1_BATCH,\n",
-" per_device_eval_batch_size=
+" per_device_eval_batch_size=4,\n",
 " gradient_accumulation_steps=STAGE1_GRAD_ACCUM,\n",
 " learning_rate=STAGE1_LR,\n",
 " weight_decay=WEIGHT_DECAY,\n",

@@ -605,12 +605,12 @@
 " load_best_model_at_end=True,\n",
 " metric_for_best_model=\"macro_f1\",\n",
 " greater_is_better=True,\n",
-" bf16=
-" fp16=
+" bf16=False, # DeBERTa-v3 breaks with fp16 gradient scaler; fp32 is safest on T4\n",
+" fp16=False,\n",
 " logging_strategy=\"steps\",\n",
 " logging_steps=50,\n",
 " logging_first_step=True,\n",
-" disable_tqdm=False,
+" disable_tqdm=False,\n",
 " report_to=\"none\",\n",
 " dataloader_num_workers=2,\n",
 " seed=SEED,\n",

@@ -725,7 +725,7 @@
 " output_dir=\"./stage2_cuad\",\n",
 " num_train_epochs=STAGE2_EPOCHS,\n",
 " per_device_train_batch_size=STAGE2_BATCH,\n",
-" per_device_eval_batch_size=
+" per_device_eval_batch_size=4,\n",
 " gradient_accumulation_steps=STAGE2_GRAD_ACCUM,\n",
 " learning_rate=STAGE2_LR,\n",
 " weight_decay=WEIGHT_DECAY,\n",

@@ -737,8 +737,8 @@
 " load_best_model_at_end=True,\n",
 " metric_for_best_model=\"macro_f1\",\n",
 " greater_is_better=True,\n",
-" bf16=
-" fp16=
+" bf16=False, # DeBERTa-v3 breaks with fp16 gradient scaler; fp32 is safest on T4\n",
+" fp16=False,\n",
 " logging_strategy=\"steps\",\n",
 " logging_steps=25,\n",
 " logging_first_step=True,\n",
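Why both precision flags are now off: the T4 is compute capability 7.5, which has no bfloat16 support, and DeBERTa-v3 is widely reported to produce NaN losses under fp16 mixed precision, so full fp32 is the safe choice on this GPU. A minimal sketch of how the flag choice could be made portable across GPUs; pick_precision_flags is a hypothetical helper, not part of the notebook, and it assumes stock PyTorch:

import torch

def pick_precision_flags(allow_fp16: bool = False) -> dict:
    """Choose TrainingArguments precision flags for the current GPU."""
    # Ampere and newer GPUs (A100, L4, ...) support bf16; the T4 (sm_75) does not.
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return {"bf16": True, "fp16": False}
    # fp16 is the known failure mode for DeBERTa-v3, so it stays opt-in;
    # full fp32 is the default fallback, matching this commit.
    if allow_fp16:
        return {"bf16": False, "fp16": True}
    return {"bf16": False, "fp16": False}

# Usage: TrainingArguments(..., **pick_precision_flags())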
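The batch settings are the other half of the fix: dropping mixed precision roughly doubles activation memory, so the per-device batch shrinks to 2 while gradient accumulation rises to 16 steps. The product is what matters for optimization, so the learning rates above should not need retuning. A quick sanity check, reusing the notebook's config names with the values from this diff:

STAGE1_BATCH = 2
STAGE1_GRAD_ACCUM = 16

# The optimizer steps once every GRAD_ACCUM micro-batches, so each update
# averages gradients over batch * accum examples.
effective_batch = STAGE1_BATCH * STAGE1_GRAD_ACCUM
assert effective_batch == 32  # the effective batch the inline comments document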