Spaces:

Prasham1710
/

ci-triage-training

Sleeping

Prasham.Jain Claude Sonnet 4.6 commited on about 1 month ago

Commit

8580936

1 Parent(s): 4647df7

feat(training): switch from LoRA to QLoRA per mentor recommendation

QLoRA = NF4-quantized frozen base + bf16 LoRA adapters trained on top.
unsloth's load_in_4bit=True already implements this; make it explicit:

- lora_alpha: 16 → 32 (= 2×r, follows the QLoRA paper scaling rule)
- lora_dropout: 0 → 0.05 (standard for QLoRA regularisation)
- Add module-level docstring explaining the QLoRA setup clearly
- Note: unsloth calls prepare_model_for_kbit_training() internally so
no manual call needed

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

src/ci_triage_env/training/sft.py +18 -8

src/ci_triage_env/training/sft.py CHANGED Viewed

@@ -1,4 +1,10 @@
-"""SFT warmstart trainer — Qwen3-4B + LoRA via unsloth.
 All GPU-heavy imports (unsloth, trl, torch) are lazy so the module is
 importable without a GPU for testing.
@@ -6,8 +12,8 @@ importable without a GPU for testing.
 from __future__ import annotations
-# unsloth hosts optimised weights; the bnb-4bit variant skips on-the-fly quantisation
-# so it loads ~2x faster than the base float16 weights.
 MODEL_NAME = "unsloth/Qwen3-4B-bnb-4bit"
 MAX_SEQ_LEN = 8192
@@ -16,16 +22,20 @@ def load_model_for_sft(
     model_name: str = MODEL_NAME,
     max_seq_length: int = MAX_SEQ_LEN,
 ):
-    """Load Qwen3-4B with unsloth 4-bit + LoRA. Requires GPU and unsloth installed."""
     from unsloth import FastLanguageModel  # type: ignore[import]
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=model_name,
         max_seq_length=max_seq_length,
         load_in_4bit=True,
-        dtype=None,          # auto — bfloat16 on Ampere+
     )
     model = FastLanguageModel.get_peft_model(
         model,
         r=16,
@@ -33,10 +43,10 @@ def load_model_for_sft(
             "q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj",
         ],
-        lora_alpha=16,
-        lora_dropout=0,
         bias="none",
-        use_gradient_checkpointing="unsloth",  # unsloth's gradient checkpointing is 30% faster
         random_state=3407,
     )
     return model, tokenizer

+"""SFT warmstart trainer — Qwen3-4B + QLoRA via unsloth.
+QLoRA setup:
+  - Base model loaded in NF4 4-bit (frozen) — the "Q" in QLoRA
+  - LoRA adapter matrices trained in bf16 on top — the "LoRA" part
+  - Double quantization enabled by default in unsloth for ~0.4 bit extra savings
+  - unsloth calls prepare_model_for_kbit_training() internally
 All GPU-heavy imports (unsloth, trl, torch) are lazy so the module is
 importable without a GPU for testing.
 from __future__ import annotations
+# unsloth hosts optimised weights; the bnb-4bit variant is pre-quantised to NF4
+# so it loads ~2x faster than quantising float16 on the fly.
 MODEL_NAME = "unsloth/Qwen3-4B-bnb-4bit"
 MAX_SEQ_LEN = 8192
     model_name: str = MODEL_NAME,
     max_seq_length: int = MAX_SEQ_LEN,
 ):
+    """Load Qwen3-4B with QLoRA (NF4 base + bf16 LoRA adapters) via unsloth."""
     from unsloth import FastLanguageModel  # type: ignore[import]
+    # load_in_4bit=True → base weights frozen in NF4 4-bit (QLoRA)
+    # dtype=None        → adapter compute dtype auto-selected (bf16 on Ampere+)
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name=model_name,
         max_seq_length=max_seq_length,
         load_in_4bit=True,
+        dtype=None,
     )
+    # QLoRA LoRA config: r=16 is standard for 4B models.
+    # lora_alpha=32 (= 2×r) follows the QLoRA paper's scaling recommendation.
     model = FastLanguageModel.get_peft_model(
         model,
         r=16,
             "q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj",
         ],
+        lora_alpha=32,
+        lora_dropout=0.05,
         bias="none",
+        use_gradient_checkpointing="unsloth",  # 30% faster than standard
         random_state=3407,
     )
     return model, tokenizer