gaurv007 committed · Commit 55761de · verified · 1 Parent(s): a4cb2c1

Update notebook for T4 GPU: batch=4, grad_accum=8, eval_batch=8
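The per-device batch is halved (8 → 4) and gradient accumulation is doubled (4 → 8), so the effective optimization batch stays at 32 while peak activation memory on a 16GB T4 drops; the eval batch is reduced from 16 to 8 for the same reason. A minimal sketch of that arithmetic (illustrative only, not part of the notebook):

```python
# Illustrative check: this commit changes the per-device batch and the
# accumulation steps, but leaves the effective optimization batch unchanged.
old_batch, old_accum = 8, 4   # previous A100 settings
new_batch, new_accum = 4, 8   # new T4 settings
assert old_batch * old_accum == new_batch * new_accum == 32
print(f"Effective batch size: {new_batch * new_accum}")  # -> 32
```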

Files changed (1)
  1. ml/ClauseGuard_DeBERTa_Training.ipynb +12 -13
ml/ClauseGuard_DeBERTa_Training.ipynb CHANGED
@@ -4,8 +4,7 @@
   "metadata": {
   "colab": {
   "provenance": [],
- "gpuType": "A100",
- "machine_shape": "hm"
+ "gpuType": "T4"
   },
   "kernelspec": {
   "name": "python3",
@@ -36,10 +35,10 @@
   "1. **Stage 1 — LEDGAR** (60K legal provisions, 100 classes): Teaches \"what types of contract clauses exist\"\n",
   "2. **Stage 2 — CUAD** (41 CUAD classes): Target task with Asymmetric Loss for class imbalance\n",
   "\n",
- "**Runtime:** ~4-6 hours on A100 GPU\n",
+ "**Runtime:** ~8-12 hours on T4 GPU (or ~4-6 hours on A100)\n",
   "\n",
   "**Before running:**\n",
- "1. `Runtime` → `Change runtime type` → **A100 GPU** (High-RAM if available)\n",
+ "1. `Runtime` → `Change runtime type` → **T4 GPU**\n",
   "2. `Runtime` → `Run all`\n",
   "3. Paste your HuggingFace token when prompted"
   ],
@@ -104,14 +103,14 @@
   "# Stage 1: LEDGAR config\n",
   "STAGE1_EPOCHS = 5 # LEDGAR is large, converges fast\n",
   "STAGE1_LR = 2e-5\n",
- "STAGE1_BATCH = 8\n",
- "STAGE1_GRAD_ACCUM = 4 # effective batch = 32\n",
+ "STAGE1_BATCH = 4 # T4: reduced from 8 (16GB VRAM)\n",
+ "STAGE1_GRAD_ACCUM = 8 # effective batch = 32 (4 * 8)\n",
   "\n",
   "# Stage 2: CUAD config \n",
   "STAGE2_EPOCHS = 20\n",
   "STAGE2_LR = 1e-5 # lower LR for fine-tuning pretrained model\n",
- "STAGE2_BATCH = 8\n",
- "STAGE2_GRAD_ACCUM = 4 # effective batch = 32\n",
+ "STAGE2_BATCH = 4 # T4: reduced from 8\n",
+ "STAGE2_GRAD_ACCUM = 8 # effective batch = 32 (4 * 8)\n",
   "EARLY_STOPPING_PATIENCE = 3\n",
   "\n",
   "# ASL hyperparameters (from arxiv 2009.14119)\n",
@@ -559,7 +558,7 @@
   "\n",
   "This stage uses standard cross-entropy loss since LEDGAR is well-balanced.\n",
   "\n",
- "**Expected:** ~85-90% micro-F1 after 3-5 epochs (~1-2 hours on A100)"
+ "**Expected:** ~85-90% micro-F1 after 3-5 epochs (~3-5 hours on T4, ~1-2 hours on A100)"
   ],
   "metadata": {}
   },
@@ -594,7 +593,7 @@
   " output_dir=\"./stage1_ledgar\",\n",
   " num_train_epochs=STAGE1_EPOCHS,\n",
   " per_device_train_batch_size=STAGE1_BATCH,\n",
- " per_device_eval_batch_size=16,\n",
+ " per_device_eval_batch_size=8,\n",
   " gradient_accumulation_steps=STAGE1_GRAD_ACCUM,\n",
   " learning_rate=STAGE1_LR,\n",
   " weight_decay=WEIGHT_DECAY,\n",
@@ -615,7 +614,7 @@
   " report_to=\"none\",\n",
   " dataloader_num_workers=2,\n",
   " seed=SEED,\n",
- " gradient_checkpointing=True, # Save VRAM on A100\n",
+ " gradient_checkpointing=True, # Critical for T4 (16GB VRAM)\n",
   ")\n",
   "\n",
   "stage1_trainer = Trainer(\n",
@@ -672,7 +671,7 @@
   "- Asymmetric Loss for class imbalance\n",
   "- Full fine-tuning (no LoRA bottleneck)\n",
   "\n",
- "**Expected:** 75-87% macro-F1 after 10-20 epochs (~2-4 hours on A100)"
+ "**Expected:** 75-87% macro-F1 after 10-20 epochs (~5-8 hours on T4, ~2-4 hours on A100)"
   ],
   "metadata": {}
   },
@@ -726,7 +725,7 @@
   " output_dir=\"./stage2_cuad\",\n",
   " num_train_epochs=STAGE2_EPOCHS,\n",
   " per_device_train_batch_size=STAGE2_BATCH,\n",
- " per_device_eval_batch_size=16,\n",
+ " per_device_eval_batch_size=8,\n",
   " gradient_accumulation_steps=STAGE2_GRAD_ACCUM,\n",
   " learning_rate=STAGE2_LR,\n",
   " weight_decay=WEIGHT_DECAY,\n",