Spaces:

Prasham1710
/

ci-triage-training

Sleeping

Prasham.Jain Claude Sonnet 4.6 committed about 1 month ago

Commit

ddfe351

1 Parent(s): 68277e2

fix(training): upgrade to torch 2.5.1+cu124, restore unsloth for Qwen3

Root cause of the dependency chain:
torch 2.4 → can't use torchao>0.5 → must pin transformers<4.47
transformers<4.47 → no Qwen3 (qwen3 arch added in 4.51)
transformers<4.51 → no CompileConfig → unsloth import fails

Fix: bump base Docker image to pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel
- torch 2.5.1 is compatible with modern torchao
- unsloth[cu124-torch251] installs transformers>=4.51, peft, trl, xformers
- Qwen3-4B architecture (qwen3) now recognized by transformers

sft.py:
- Restore unsloth FastLanguageModel (use_gradient_checkpointing="unsloth")
- MODEL_NAME = "unsloth/Qwen3-4B-bnb-4bit" (pre-quantized, 2x faster load)
- Add bf16=True, dataset_text_field="text" to SFTConfig

pyproject.toml:
- Bump training extras to transformers>=4.51, trl>=0.12, peft>=0.14, torch>=2.5

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (3) hide show

Dockerfile.train +8 -12
pyproject.toml +5 -4
src/ci_triage_env/training/sft.py +21 -35

Dockerfile.train CHANGED Viewed

@@ -9,7 +9,8 @@
 #   HF_SCENARIOS_REPO, HF_SFT_DATASET_REPO, HF_MODEL_REPO (optional)
 #   GRPO_STEPS (optional, default 100)
-FROM pytorch/pytorch:2.4.0-cuda12.1-cudnn9-devel
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
@@ -20,18 +21,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 WORKDIR /workspace
-# 1. Pin versions compatible with torch 2.4.1 in this image.
-#    torchao latest requires torch>=2.11; transformers>=4.47 pulls torchao as dep.
-#    bitsandbytes replaces unsloth for 4-bit quantisation.
 RUN pip install --no-cache-dir \
-    "torchao==0.5.0" \
-    "transformers==4.46.3" \
-    "trl==0.11.4" \
-    "peft==0.13.2" \
-    "accelerate==0.34.2" \
-    "bitsandbytes>=0.43.0"
-# 2. Install project deps (versions pinned above won't be overridden)
 COPY pyproject.toml README.md ./
 COPY src/ src/
 RUN pip install --no-cache-dir -e ".[data,training]"

 #   HF_SCENARIOS_REPO, HF_SFT_DATASET_REPO, HF_MODEL_REPO (optional)
 #   GRPO_STEPS (optional, default 100)
+# torch 2.5.1 + CUDA 12.4 — minimum needed for unsloth + transformers>=4.51 + Qwen3.
+FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel
 ENV DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
 WORKDIR /workspace
+# 1. Install unsloth for this exact torch/CUDA combo.
+#    This resolves and installs compatible versions of:
+#    transformers>=4.51 (Qwen3 + CompileConfig), peft, trl, accelerate, xformers.
 RUN pip install --no-cache-dir \
+    "unsloth[cu124-torch251] @ git+https://github.com/unslothai/unsloth.git"
+# 2. Install project deps (unsloth already locked transformers/trl/peft above).
 COPY pyproject.toml README.md ./
 COPY src/ src/
 RUN pip install --no-cache-dir -e ".[data,training]"

pyproject.toml CHANGED Viewed

@@ -19,10 +19,11 @@ dependencies = [
 [project.optional-dependencies]
 training = [
-    "torch>=2.3",
-    "transformers>=4.45",
-    "trl>=0.11",
-    "accelerate>=0.30",
     "wandb>=0.17",
     "matplotlib>=3.8",
     "seaborn>=0.13",

 [project.optional-dependencies]
 training = [
+    "torch>=2.5",
+    "transformers>=4.51",
+    "trl>=0.12",
+    "peft>=0.14",
+    "accelerate>=0.34",
     "wandb>=0.17",
     "matplotlib>=3.8",
     "seaborn>=0.13",

src/ci_triage_env/training/sft.py CHANGED Viewed

@@ -1,12 +1,14 @@
-"""SFT warmstart trainer — Qwen3-4B + LoRA on the C3 trajectory dataset.
-All GPU-heavy imports (trl, torch, peft) are lazy so the module is
 importable without a GPU for testing.
 """
 from __future__ import annotations
-MODEL_NAME = "Qwen/Qwen3-4B"
 MAX_SEQ_LEN = 8192
@@ -14,48 +16,29 @@ def load_model_for_sft(
     model_name: str = MODEL_NAME,
     max_seq_length: int = MAX_SEQ_LEN,
 ):
-    """Load Qwen3-4B in 4-bit via bitsandbytes + LoRA via PEFT. Requires GPU."""
-    import torch
-    from peft import LoraConfig, TaskType, get_peft_model  # type: ignore[import]
-    from transformers import (  # type: ignore[import]
-        AutoModelForCausalLM,
-        AutoTokenizer,
-        BitsAndBytesConfig,
-    )
-    bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16,
-        bnb_4bit_use_double_quant=True,
-    )
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        device_map="auto",
-        trust_remote_code=True,
     )
-    model.gradient_checkpointing_enable()
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.model_max_length = max_seq_length
-    lora_config = LoraConfig(
         r=16,
-        lora_alpha=32,
         target_modules=[
             "q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj",
         ],
-        lora_dropout=0.0,
         bias="none",
-        task_type=TaskType.CAUSAL_LM,
     )
-    model = get_peft_model(model, lora_config)
-    model.print_trainable_parameters()
     return model, tokenizer
@@ -77,7 +60,7 @@ def run_sft(
     gradient_accumulation_steps: int = 4,
     model_name: str = MODEL_NAME,
 ) -> str:
-    """Train the SFT warmstart model. Requires GPU + trl + peft + bitsandbytes.
     Args:
         dataset_path: Path to a HF Dataset saved by trajectory_gen (save_to_disk).
@@ -104,10 +87,13 @@ def run_sft(
         gradient_accumulation_steps=gradient_accumulation_steps,
         learning_rate=2e-5,
         warmup_ratio=0.05,
         logging_steps=10,
         save_steps=100,
         report_to="wandb",
         max_seq_length=MAX_SEQ_LEN,
     )
     trainer = SFTTrainer(
         model=model,

+"""SFT warmstart trainer — Qwen3-4B + LoRA via unsloth.
+All GPU-heavy imports (unsloth, trl, torch) are lazy so the module is
 importable without a GPU for testing.
 """
 from __future__ import annotations
+# unsloth hosts pre-quantised weights; the bnb-4bit variant skips on-the-fly quantisation
+# so it loads ~2x faster than the base 16-bit weights.
+MODEL_NAME = "unsloth/Qwen3-4B-bnb-4bit"
 MAX_SEQ_LEN = 8192
     model_name: str = MODEL_NAME,
     max_seq_length: int = MAX_SEQ_LEN,
 ):
+    """Load Qwen3-4B with unsloth 4-bit + LoRA. Requires GPU and unsloth installed."""
+    from unsloth import FastLanguageModel  # type: ignore[import]
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_name,
+        max_seq_length=max_seq_length,
         load_in_4bit=True,
+        dtype=None,          # auto — bfloat16 on Ampere+
     )
+    model = FastLanguageModel.get_peft_model(
+        model,
         r=16,
         target_modules=[
             "q_proj", "k_proj", "v_proj", "o_proj",
             "gate_proj", "up_proj", "down_proj",
         ],
+        lora_alpha=16,
+        lora_dropout=0,
         bias="none",
+        use_gradient_checkpointing="unsloth",  # unsloth's offloaded gradient checkpointing uses ~30% less VRAM
+        random_state=3407,
     )
     return model, tokenizer
     gradient_accumulation_steps: int = 4,
     model_name: str = MODEL_NAME,
 ) -> str:
+    """Train the SFT warmstart model. Requires GPU + unsloth + trl installed.
     Args:
         dataset_path: Path to a HF Dataset saved by trajectory_gen (save_to_disk).
         gradient_accumulation_steps=gradient_accumulation_steps,
         learning_rate=2e-5,
         warmup_ratio=0.05,
+        bf16=True,
+        fp16=False,
         logging_steps=10,
         save_steps=100,
         report_to="wandb",
         max_seq_length=MAX_SEQ_LEN,
+        dataset_text_field="text",
     )
     trainer = SFTTrainer(
         model=model,