Update notebook for T4 GPU: batch=4, grad_accum=8, eval_batch=8
ml/ClauseGuard_DeBERTa_Training.ipynb
CHANGED
@@ -4,8 +4,7 @@
 "metadata": {
 "colab": {
 "provenance": [],
-"gpuType": "A100",
-"machine_shape": "hm"
+"gpuType": "T4"
 },
 "kernelspec": {
 "name": "python3",
@@ -36,10 +35,10 @@
 "1. **Stage 1 — LEDGAR** (60K legal provisions, 100 classes): Teaches \"what types of contract clauses exist\"\n",
 "2. **Stage 2 — CUAD** (41 CUAD classes): Target task with Asymmetric Loss for class imbalance\n",
 "\n",
-"**Runtime:** ~4-6 hours on A100\n",
+"**Runtime:** ~8-12 hours on T4 GPU (or ~4-6 hours on A100)\n",
 "\n",
 "**Before running:**\n",
-"1. `Runtime` → `Change runtime type` → **A100 GPU**\n",
+"1. `Runtime` → `Change runtime type` → **T4 GPU**\n",
 "2. `Runtime` → `Run all`\n",
 "3. Paste your HuggingFace token when prompted"
 ],
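Before the run starts, it is worth confirming that Colab actually assigned a T4, since the wrong accelerator silently changes the time and memory math above. A quick check using standard PyTorch calls (this cell is not in the notebook):

import torch

# Confirm the assigned accelerator before committing to a multi-hour run.
assert torch.cuda.is_available(), "No GPU assigned; check Runtime -> Change runtime type"
print(torch.cuda.get_device_name(0))  # expect something like "Tesla T4"
print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")  # ~16 GB on a T4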
@@ -104,14 +103,14 @@
 "# Stage 1: LEDGAR config\n",
 "STAGE1_EPOCHS = 5 # LEDGAR is large, converges fast\n",
 "STAGE1_LR = 2e-5\n",
-"STAGE1_BATCH = 8\n",
-"STAGE1_GRAD_ACCUM = 4\n",
+"STAGE1_BATCH = 4 # T4: reduced from 8 (16GB VRAM)\n",
+"STAGE1_GRAD_ACCUM = 8 # effective batch = 32 (4 * 8)\n",
 "\n",
 "# Stage 2: CUAD config \n",
 "STAGE2_EPOCHS = 20\n",
 "STAGE2_LR = 1e-5 # lower LR for fine-tuning pretrained model\n",
-"STAGE2_BATCH = 8\n",
-"STAGE2_GRAD_ACCUM = 4\n",
+"STAGE2_BATCH = 4 # T4: reduced from 8\n",
+"STAGE2_GRAD_ACCUM = 8 # effective batch = 32 (4 * 8)\n",
 "EARLY_STOPPING_PATIENCE = 3\n",
 "\n",
 "# ASL hyperparameters (from arxiv 2009.14119)\n",
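The batch/accumulation swap is the core of this commit: halving the per-device batch while doubling accumulation keeps the effective batch at 32, so learning rates and schedules tuned for the A100 config still apply. A sketch of the arithmetic, where effective_batch is a hypothetical helper, single-GPU Colab is assumed, and the old GRAD_ACCUM of 4 is inferred from the "effective batch = 32" comment:

def effective_batch(per_device_batch: int, grad_accum: int, num_gpus: int = 1) -> int:
    # Number of samples contributing to a single optimizer step.
    return per_device_batch * grad_accum * num_gpus

# A100 config (8, 4) and T4 config (4, 8) produce the same update size.
assert effective_batch(8, 4) == effective_batch(4, 8) == 32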
@@ -559,7 +558,7 @@
 "\n",
 "This stage uses standard cross-entropy loss since LEDGAR is well-balanced.\n",
 "\n",
-"**Expected:** ~85-90% micro-F1 after 3-5 epochs (~1-2 hours on A100)"
+"**Expected:** ~85-90% micro-F1 after 3-5 epochs (~3-5 hours on T4, ~1-2 hours on A100)"
 ],
 "metadata": {}
 },
@@ -594,7 +593,7 @@
 " output_dir=\"./stage1_ledgar\",\n",
 " num_train_epochs=STAGE1_EPOCHS,\n",
 " per_device_train_batch_size=STAGE1_BATCH,\n",
-" per_device_eval_batch_size=
+" per_device_eval_batch_size=8,\n",
 " gradient_accumulation_steps=STAGE1_GRAD_ACCUM,\n",
 " learning_rate=STAGE1_LR,\n",
 " weight_decay=WEIGHT_DECAY,\n",
@@ -615,7 +614,7 @@
 " report_to=\"none\",\n",
 " dataloader_num_workers=2,\n",
 " seed=SEED,\n",
-" gradient_checkpointing=True, #
+" gradient_checkpointing=True, # Critical for T4 (16GB VRAM)\n",
 ")\n",
 "\n",
 "stage1_trainer = Trainer(\n",
@@ -672,7 +671,7 @@
 "- Asymmetric Loss for class imbalance\n",
 "- Full fine-tuning (no LoRA bottleneck)\n",
 "\n",
-"**Expected:** 75-87% macro-F1 after 10-20 epochs (~2-4 hours on A100)"
+"**Expected:** 75-87% macro-F1 after 10-20 epochs (~5-8 hours on T4, ~2-4 hours on A100)"
 ],
 "metadata": {}
 },
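For context on the Asymmetric Loss this stage relies on, a minimal multi-label sketch following the paper the notebook cites (arXiv 2009.14119). This is not the notebook's exact implementation; the defaults gamma_neg=4, gamma_pos=0, clip=0.05 are the paper's:

import torch

class AsymmetricLoss(torch.nn.Module):
    # Sketch of ASL for multi-label classification (arXiv 2009.14119),
    # not the notebook's exact code.
    def __init__(self, gamma_neg=4.0, gamma_pos=0.0, clip=0.05, eps=1e-8):
        super().__init__()
        self.gamma_neg, self.gamma_pos = gamma_neg, gamma_pos
        self.clip, self.eps = clip, eps

    def forward(self, logits, targets):
        p = torch.sigmoid(logits)
        # Probability shifting: hard-discard very easy negatives.
        p_neg = (p - self.clip).clamp(min=0.0)
        # Asymmetric focusing: down-weight easy negatives more than positives.
        loss_pos = targets * (1 - p).pow(self.gamma_pos) * torch.log(p.clamp(min=self.eps))
        loss_neg = (1 - targets) * p_neg.pow(self.gamma_neg) * torch.log((1 - p_neg).clamp(min=self.eps))
        return -(loss_pos + loss_neg).mean()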
@@ -726,7 +725,7 @@
 " output_dir=\"./stage2_cuad\",\n",
 " num_train_epochs=STAGE2_EPOCHS,\n",
 " per_device_train_batch_size=STAGE2_BATCH,\n",
-" per_device_eval_batch_size=
+" per_device_eval_batch_size=8,\n",
 " gradient_accumulation_steps=STAGE2_GRAD_ACCUM,\n",
 " learning_rate=STAGE2_LR,\n",
 " weight_decay=WEIGHT_DECAY,\n",
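Taken together, the memory-relevant settings map onto transformers.TrainingArguments roughly as follows; this is a sketch of the post-commit Stage 1 values, with the notebook's remaining arguments omitted:

from transformers import TrainingArguments

STAGE1_BATCH = 4       # T4: reduced from 8 (16GB VRAM)
STAGE1_GRAD_ACCUM = 8  # effective batch = 32 (4 * 8)

stage1_args = TrainingArguments(
    output_dir="./stage1_ledgar",
    per_device_train_batch_size=STAGE1_BATCH,
    per_device_eval_batch_size=8,   # eval keeps no gradients or optimizer state, so 8 fits where train batch 4 does
    gradient_accumulation_steps=STAGE1_GRAD_ACCUM,
    gradient_checkpointing=True,    # recompute activations in backward; the main lever that makes 16GB work
)

Stage 2 follows the same pattern with the STAGE2_* constants and "./stage2_cuad" as the output directory.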