# Scraped page header (not part of the module): Spaces: Running | File size: 6,062 Bytes | dc71cad
"""
fine_tuning/qlora_config.py
────────────────────────────
QLoRA fine-tuning configuration for DeepSeek-Coder-7B.
Architecture choices:
- Base: DeepSeek-Coder-7B-instruct (already instruction-tuned)
- Quantisation: 4-bit NF4 with double quantisation (bitsandbytes)
- LoRA: r=16, alpha=32, dropout=0.05
- Target modules: q_proj, v_proj, k_proj, o_proj, gate_proj, up_proj, down_proj
- Training: 3 epochs, lr=2e-4, batch=4, grad_accum=4 (effective batch=16)
- Sequence length: 4096 tokens (covers most patches + context)
Why these choices:
- r=16: standard for instruction tuning; higher r = more capacity but slower
- alpha=32: alpha/r=2 is the standard scaling factor
- gate/up/down_proj: including MLP layers improves code generation quality
- 4-bit NF4: 4-bit Normal Float — designed for weight distributions
- double quantisation: quantises the quantisation constants too (~0.4 GB saved)
GPU requirements:
- 7B model in 4-bit: ~4.5 GB VRAM
- LoRA adapters: ~120 MB
- Activations + gradients: ~8 GB at seq_len=4096, batch=4
- Total: ~14 GB — fits comfortably on A100-40G or RTX 4090
- RunPod cost: ~$60 for 3 epochs on full SWE-bench Lite dataset
This file: pure dataclasses, no torch/transformers imports at module level.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
@dataclass
class BitsAndBytesConfig:
    """Settings for loading the base model with 4-bit bitsandbytes quantisation.

    Every value is a plain Python type (the compute dtype is stored as a
    string name), so instances can be created without importing torch or
    transformers.

    Note: this is an independent dataclass; it merely shares its name with
    ``transformers.BitsAndBytesConfig``.
    """

    # Keep the model weights in 4-bit precision.
    load_in_4bit: bool = True

    # 4-bit data type: NF4 (Normal Float), preferred over Int4 for weight
    # distributions.
    bnb_4bit_quant_type: str = "nf4"

    # Name of the dtype used for computation; storage stays 4-bit.
    bnb_4bit_compute_dtype: str = "bfloat16"

    # Quantise the quantisation constants as well (about 0.4 GB extra saved).
    bnb_4bit_use_double_quant: bool = True
def _default_lora_targets() -> list[str]:
    """Return a fresh list of the projection layers that receive adapters."""
    attention_layers = ["q_proj", "v_proj", "k_proj", "o_proj"]
    # MLP projections are included because they are considered critical for
    # code generation quality.
    mlp_layers = ["gate_proj", "up_proj", "down_proj"]
    return attention_layers + mlp_layers


@dataclass
class LoRAConfig:
    """Hyperparameters describing the LoRA adapters.

    ``target_modules`` and ``modules_to_save`` use default factories, so each
    instance owns its own lists.
    """

    # Adapter rank.
    r: int = 16
    # Numerator of the adapter scaling factor (see ``scaling``).
    lora_alpha: int = 32
    # Dropout probability for the adapter layers.
    lora_dropout: float = 0.05
    bias: str = "none"
    task_type: str = "CAUSAL_LM"
    # Names of the modules to adapt: attention projections followed by the
    # MLP projections.
    target_modules: list[str] = field(default_factory=_default_lora_targets)
    # Empty by default.
    modules_to_save: list[str] = field(default_factory=list)

    @property
    def scaling(self) -> float:
        """Ratio ``lora_alpha / r`` (2.0 with the default values)."""
        alpha, rank = self.lora_alpha, self.r
        return alpha / rank
@dataclass
class TrainingConfig:
    """SFT training hyperparameters for the QLoRA fine-tuning run.

    Groups the model and data locations, optimiser and schedule settings,
    checkpoint/evaluation cadence, experiment tracking, the nested LoRA and
    quantisation configs, and generation defaults used at inference time.

    NOTE(review): many field names mirror Hugging Face ``TrainingArguments``
    and ``generate()`` options; the comments below assume that is where the
    values end up. Confirm against the training script.
    """

    # Model
    model_name: str = "deepseek-ai/deepseek-coder-7b-instruct-v1.5"
    output_dir: str = "results/fine_tuning/checkpoints"
    run_name: str = "deepseek-coder-7b-qlora-swe"

    # Data
    train_file: str = "results/fine_tuning/train.jsonl"
    val_file: str = "results/fine_tuning/val.jsonl"
    max_seq_length: int = 4096
    dataset_text_field: str = "text"  # field in JSONL containing ChatML text
    packing: bool = False  # don't pack — patch sequences vary in length

    # Training
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 2
    gradient_accumulation_steps: int = 4  # effective batch = 4 * 4 = 16
    learning_rate: float = 2e-4
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.05
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0
    optim: str = "paged_adamw_32bit"  # memory-efficient adamw

    # Mixed precision (bf16 on, fp16 off)
    bf16: bool = True
    fp16: bool = False

    # Saving & logging
    save_strategy: str = "steps"
    save_steps: int = 100
    # Caps how many checkpoints are kept on disk. This is a count limit, not
    # a "3 best" selection; with HF Trainer the best checkpoint is retained
    # because load_best_model_at_end is enabled below.
    save_total_limit: int = 3
    logging_steps: int = 10
    # Evaluation runs on the same 100-step cadence as saving.
    eval_strategy: str = "steps"
    eval_steps: int = 100
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False  # lower eval_loss is better

    # Experiment tracking (MLflow is the only tracker configured here)
    report_to: str = "mlflow"
    mlflow_experiment_name: str = "deepseek-coder-qlora"

    # LoRA + quantisation (default factories give each instance its own copy)
    lora: LoRAConfig = field(default_factory=LoRAConfig)
    bnb: BitsAndBytesConfig = field(default_factory=BitsAndBytesConfig)

    # Inference
    max_new_tokens: int = 1024
    do_sample: bool = False  # greedy for deterministic patches
    # Only relevant when do_sample is switched on; greedy decoding ignores it.
    temperature: float = 0.2

    @property
    def effective_batch_size(self) -> int:
        """Per-device batch size times gradient accumulation steps.

        The number of devices is not part of this config, so the value is the
        effective batch per device.
        """
        return self.per_device_train_batch_size * self.gradient_accumulation_steps

    @property
    def output_path(self) -> Path:
        """``output_dir`` wrapped in a ``pathlib.Path``."""
        return Path(self.output_dir)

    def estimate_vram_gb(self, hidden_dim: int = 4096, bytes_per_value: int = 2) -> float:
        """Return a rough VRAM estimate in GB.

        The estimate is the sum of three terms: a fixed 4.5 GB for a 7B model
        stored in 4-bit, a fixed 0.12 GB for the LoRA adapters, and one
        ``batch x seq_len x hidden_dim`` tensor of ``bytes_per_value`` bytes.

        With the defaults the tensor term is about 0.13 GB, giving roughly
        4.75 GB in total. It is not scaled by layer count, gradients or
        optimiser state, so the result is far below the ~14 GB total quoted
        in the module docstring; treat it as a lower bound.

        Args:
            hidden_dim: Hidden size of the base model. Defaults to 4096, the
                value that was previously hard-coded; pass the real hidden
                size when ``model_name`` points at a different architecture.
            bytes_per_value: Bytes per activation value. Defaults to 2 (bf16).

        Returns:
            Estimated VRAM usage in gigabytes (1 GB = 1e9 bytes).
        """
        model_gb = 4.5  # 7B in 4-bit
        lora_gb = 0.12  # LoRA adapters
        activations_gb = (
            self.per_device_train_batch_size
            * self.max_seq_length
            * hidden_dim
            * bytes_per_value
            / 1e9
        )
        return model_gb + lora_gb + activations_gb
# ── Alternative configs for ablation ─────────────────────────────────────────
def get_config(variant: str = "default") -> TrainingConfig:
    """Return a pre-built config for an ablation experiment.

    A new ``TrainingConfig`` is created on every call, and only the requested
    variant is instantiated.

    Variants:
        default — standard QLoRA, 3 epochs
        small_r — r=8, lora_alpha=16 (less capacity, faster)
        large_r — r=32, lora_alpha=64 (more capacity, slower)
        no_mlp  — skip MLP modules (attention-only LoRA)
        longer  — 5 epochs (risk of overfitting)
        qwen    — same settings with Qwen/Qwen2.5-Coder-7B-Instruct as base

    Args:
        variant: Name of the variant to build.

    Returns:
        The ``TrainingConfig`` for ``variant``.

    Raises:
        ValueError: If ``variant`` is not one of the names listed above.
    """
    # Zero-argument factories keyed by variant name; dict order also fixes
    # the order of the names shown in the error message.
    factories = {
        "default": TrainingConfig,
        "small_r": lambda: TrainingConfig(lora=LoRAConfig(r=8, lora_alpha=16)),
        "large_r": lambda: TrainingConfig(lora=LoRAConfig(r=32, lora_alpha=64)),
        "no_mlp": lambda: TrainingConfig(
            lora=LoRAConfig(target_modules=["q_proj", "v_proj", "k_proj", "o_proj"])
        ),
        "longer": lambda: TrainingConfig(num_train_epochs=5),
        "qwen": lambda: TrainingConfig(model_name="Qwen/Qwen2.5-Coder-7B-Instruct"),
    }
    if variant not in factories:
        raise ValueError(f"Unknown variant: {variant}. Choose from {list(factories)}")
    return factories[variant]()
|