File size: 6,062 Bytes
dc71cad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
fine_tuning/qlora_config.py
────────────────────────────
QLoRA fine-tuning configuration for DeepSeek-Coder-7B.

Architecture choices:
  - Base: DeepSeek-Coder-7B-instruct (already instruction-tuned)
  - Quantisation: 4-bit NF4 with double quantisation (bitsandbytes)
  - LoRA: r=16, alpha=32, dropout=0.05
  - Target modules: q_proj, v_proj, k_proj, o_proj, gate_proj, up_proj, down_proj
  - Training: 3 epochs, lr=2e-4, batch=4, grad_accum=4 (effective batch=16)
  - Sequence length: 4096 tokens (covers most patches + context)

Why these choices:
  - r=16: standard for instruction tuning; higher r = more capacity but slower
  - alpha=32: alpha/r=2 is the standard scaling factor
  - gate/up/down_proj: including MLP layers improves code generation quality
  - 4-bit NF4: 4-bit Normal Float — designed for weight distributions
  - double quantisation: quantises the quantisation constants too (~0.4 GB saved)

GPU requirements:
  - 7B model in 4-bit: ~4.5 GB VRAM
  - LoRA adapters: ~120 MB
  - Activations + gradients: ~8 GB at seq_len=4096, batch=4
  - Total: ~14 GB — fits comfortably on A100-40G or RTX 4090
  - RunPod cost: ~$60 for 3 epochs on full SWE-bench Lite dataset

This file: pure dataclasses, no torch/transformers imports at module level.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional


@dataclass
class BitsAndBytesConfig:
    """Settings describing how the base model is quantised to 4 bits.

    Every value is a plain ``bool``/``str`` so that this module can be
    imported without torch or bitsandbytes being installed; turning the
    dtype name into a real dtype is left to whoever consumes this config.
    NOTE(review): the field names look like they mirror the keyword
    arguments of the transformers ``BitsAndBytesConfig`` — confirm against
    the training script.
    """

    # Store the base weights in 4-bit precision.
    load_in_4bit: bool = True

    # NF4 ("Normal Float") is preferred over Int4 for weight distributions.
    bnb_4bit_quant_type: str = "nf4"

    # Weights are stored in 4 bits, but compute runs in bf16 (dtype by name).
    bnb_4bit_compute_dtype: str = "bfloat16"

    # Also quantise the quantisation constants; saves roughly 0.4 GB more.
    bnb_4bit_use_double_quant: bool = True


def _default_lora_targets() -> list[str]:
    """Return a fresh list of the module names adapted by default."""
    attention = ["q_proj", "v_proj", "k_proj", "o_proj"]
    mlp = ["gate_proj", "up_proj", "down_proj"]  # MLP — critical for code gen
    return attention + mlp


@dataclass
class LoRAConfig:
    """Hyperparameters of the LoRA adapters.

    ``target_modules`` and ``modules_to_save`` are built by default
    factories, so each instance owns its own list and mutating one config
    never leaks into another.
    """

    # Adapter rank.
    r: int = 16
    # Numerator of the adapter scaling factor (see ``scaling``).
    lora_alpha: int = 32
    # Dropout applied on the adapter path.
    lora_dropout: float = 0.05
    # "none": no bias terms are adapted.
    bias: str = "none"
    task_type: str = "CAUSAL_LM"
    # Attention projections followed by the MLP projections.
    target_modules: list[str] = field(default_factory=_default_lora_targets)
    # Extra modules to store alongside the adapters; empty by default.
    modules_to_save: list[str] = field(default_factory=list)

    @property
    def scaling(self) -> float:
        """Return ``lora_alpha / r`` (2.0 with the default 32 / 16)."""
        alpha, rank = self.lora_alpha, self.r
        return alpha / rank


@dataclass
class TrainingConfig:
    """Hyperparameters for the supervised fine-tuning (SFT) run.

    Groups the model and data locations, optimiser and schedule settings,
    checkpointing/logging cadence, the nested LoRA and quantisation configs,
    and the decoding settings used at inference time.  Everything is a plain
    Python value so the module stays importable without torch/transformers.
    NOTE(review): most field names look like they mirror Hugging Face
    trainer arguments — confirm against the training script that consumes
    this object.
    """

    # --- Model ---
    model_name: str = "deepseek-ai/deepseek-coder-7b-instruct-v1.5"
    output_dir: str = "results/fine_tuning/checkpoints"
    run_name: str = "deepseek-coder-7b-qlora-swe"

    # --- Data ---
    train_file: str = "results/fine_tuning/train.jsonl"
    val_file: str = "results/fine_tuning/val.jsonl"
    max_seq_length: int = 4096
    dataset_text_field: str = "text"      # field in JSONL containing ChatML text
    packing: bool = False                  # don't pack — patch sequences vary in length

    # --- Training ---
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 2
    gradient_accumulation_steps: int = 4  # effective batch = 4 * 4 = 16
    learning_rate: float = 2e-4
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.05
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0
    optim: str = "paged_adamw_32bit"      # memory-efficient adamw

    # --- Mixed precision ---
    bf16: bool = True    # bfloat16 training
    fp16: bool = False

    # --- Saving & logging ---
    save_strategy: str = "steps"
    save_steps: int = 100
    # Cap on the number of checkpoints kept on disk.
    # NOTE(review): the HF Trainer presumably prunes the *oldest* checkpoints
    # (while protecting the best one when load_best_model_at_end is set),
    # not the worst-scoring ones — confirm.
    save_total_limit: int = 3
    logging_steps: int = 10
    eval_strategy: str = "steps"
    eval_steps: int = 100                 # same cadence as save_steps
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False       # lower eval_loss is better

    # --- Experiment tracking ---
    report_to: str = "mlflow"
    mlflow_experiment_name: str = "deepseek-coder-qlora"

    # --- LoRA + quantisation (fresh nested config per instance) ---
    lora: LoRAConfig = field(default_factory=LoRAConfig)
    bnb: BitsAndBytesConfig = field(default_factory=BitsAndBytesConfig)

    # --- Inference ---
    max_new_tokens: int = 1024
    do_sample: bool = False    # greedy for deterministic patches
    # NOTE(review): temperature is presumably ignored while do_sample is
    # False; it only matters if sampling is switched on — confirm.
    temperature: float = 0.2

    @property
    def effective_batch_size(self) -> int:
        """Per-device train batch size times gradient accumulation steps.

        The number of devices is not taken into account.
        """
        return self.per_device_train_batch_size * self.gradient_accumulation_steps

    @property
    def output_path(self) -> Path:
        """``output_dir`` wrapped in a :class:`pathlib.Path`."""
        return Path(self.output_dir)

    def estimate_vram_gb(
        self,
        *,
        hidden_dim: int = 4096,
        bytes_per_element: int = 2,
    ) -> float:
        """Return a rough VRAM estimate in GB.

        The estimate adds three terms: a fixed 4.5 GB for a 7B model held in
        4-bit, a fixed 0.12 GB for the LoRA adapters, and the size of a
        single ``batch x seq_len x hidden_dim`` activation tensor.  It does
        not multiply by the number of layers and does not account for
        gradients or optimizer state, so treat it as a lower bound rather
        than a budget.

        Args:
            hidden_dim: Hidden size of the base model.  The default of 4096
                reproduces the previously hard-coded value; pass the real
                value when ``model_name`` points at a model with a different
                hidden size.
            bytes_per_element: Bytes per activation element.  The default of
                2 corresponds to bf16; use 4 for fp32.

        Returns:
            Estimated VRAM in gigabytes (1 GB = 1e9 bytes).
        """
        model_gb = 4.5    # 7B in 4-bit
        lora_gb = 0.12    # LoRA adapters
        # Integer byte count first, then a single division, so the default
        # arguments give exactly the same float as before.
        activation_bytes = (
            self.per_device_train_batch_size
            * self.max_seq_length
            * hidden_dim
            * bytes_per_element
        )
        return model_gb + lora_gb + activation_bytes / 1e9


# ── Alternative configs for ablation ─────────────────────────────────────────

def get_config(variant: str = "default") -> TrainingConfig:
    """
    Return a freshly built config for one of the ablation experiments.

    Variants:
        default     — standard QLoRA, 3 epochs
        small_r     — r=8, alpha=16 (less capacity, faster)
        large_r     — r=32, alpha=64 (more capacity, slower)
        no_mlp      — skip MLP modules (attention-only LoRA)
        longer      — 5 epochs (risk of overfitting)
        qwen        — default hyperparameters on Qwen/Qwen2.5-Coder-7B-Instruct

    Args:
        variant: Name of the variant to build; one of the keys listed above.

    Returns:
        A new ``TrainingConfig`` instance; callers may mutate it freely.

    Raises:
        ValueError: If ``variant`` is not a known variant name.
    """
    # Zero-argument factories: only the requested variant is instantiated,
    # and an unknown name is rejected before any config is built.
    builders = {
        "default": lambda: TrainingConfig(),
        "small_r": lambda: TrainingConfig(lora=LoRAConfig(r=8, lora_alpha=16)),
        "large_r": lambda: TrainingConfig(lora=LoRAConfig(r=32, lora_alpha=64)),
        "no_mlp": lambda: TrainingConfig(
            lora=LoRAConfig(target_modules=["q_proj", "v_proj", "k_proj", "o_proj"])
        ),
        "longer": lambda: TrainingConfig(num_train_epochs=5),
        "qwen": lambda: TrainingConfig(model_name="Qwen/Qwen2.5-Coder-7B-Instruct"),
    }
    build = builders.get(variant)
    if build is None:
        raise ValueError(f"Unknown variant: {variant}. Choose from {list(builders)}")
    return build()