"""
fine_tuning/qlora_config.py
────────────────────────────
QLoRA fine-tuning configuration for DeepSeek-Coder-7B.

Architecture choices:
  - Base: DeepSeek-Coder-7B-instruct (already instruction-tuned)
  - Quantisation: 4-bit NF4 with double quantisation (bitsandbytes)
  - LoRA: r=16, alpha=32, dropout=0.05
  - Target modules: q_proj, v_proj, k_proj, o_proj, gate_proj, up_proj, down_proj
  - Training: 3 epochs, lr=2e-4, batch=4, grad_accum=4 (effective batch=16)
  - Sequence length: 4096 tokens (covers most patches + context)

Why these choices:
  - r=16: standard for instruction tuning; higher r = more capacity but slower
  - alpha=32: alpha/r=2 is the standard scaling factor
  - gate/up/down_proj: including MLP layers improves code generation quality
  - 4-bit NF4: 4-bit Normal Float — designed for weight distributions
  - double quantisation: quantises the quantisation constants too (~0.4 GB saved)

GPU requirements:
  - 7B model in 4-bit: ~4.5 GB VRAM
  - LoRA adapters: ~120 MB
  - Activations + gradients: ~8 GB at seq_len=4096, batch=4
  - Total: ~14 GB — fits comfortably on A100-40G or RTX 4090
  - RunPod cost: ~$60 for 3 epochs on full SWE-bench Lite dataset

This file: plain dataclasses plus the get_config() variant factory; no torch/transformers imports at module level.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional


@dataclass
class BitsAndBytesConfig:
    """Plain-data description of the 4-bit bitsandbytes quantisation setup.

    Every value is a bool or a string, so building this object needs no
    torch or bitsandbytes import.
    """
    # Load the weights in 4-bit form.
    load_in_4bit: bool = True
    # Quantisation data type; NF4 is preferred over Int4 for weight distributions.
    bnb_4bit_quant_type: str = "nf4"
    # Storage is 4-bit, compute runs in bf16 (dtype kept as a name, not an object).
    bnb_4bit_compute_dtype: str = "bfloat16"
    # Also quantise the quantisation constants (saves roughly 0.4 GB more).
    bnb_4bit_use_double_quant: bool = True


def _default_target_modules() -> list[str]:
    """Return a new list naming the layers that get LoRA adapters by default."""
    attention = ["q_proj", "v_proj", "k_proj", "o_proj"]
    mlp = ["gate_proj", "up_proj", "down_proj"]  # MLP — critical for code gen
    return attention + mlp


@dataclass
class LoRAConfig:
    """Hyperparameters of the LoRA adapters.

    The list-valued fields use factories, so each instance owns its own
    lists and mutating one config never leaks into another.
    """
    r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    bias: str = "none"
    task_type: str = "CAUSAL_LM"
    # Attention projections followed by the MLP projections.
    target_modules: list[str] = field(default_factory=_default_target_modules)
    # Empty unless the caller asks for extra modules.
    modules_to_save: list[str] = field(default_factory=list)

    @property
    def scaling(self) -> float:
        """Ratio ``lora_alpha / r`` (2.0 with the defaults)."""
        alpha, rank = self.lora_alpha, self.r
        return alpha / rank

@dataclass
class TrainingConfig:
    """SFT training hyperparameters.

    Collects, in one value object, the model and data locations, the
    optimiser and schedule settings, the checkpoint/eval/logging cadence,
    the nested LoRA and quantisation configs, and the generation settings.
    The class only stores values and derives a few numbers from them.
    """
    # Model
    model_name: str = "deepseek-ai/deepseek-coder-7b-instruct-v1.5"
    output_dir: str = "results/fine_tuning/checkpoints"
    run_name: str = "deepseek-coder-7b-qlora-swe"

    # Data
    train_file: str = "results/fine_tuning/train.jsonl"
    val_file: str = "results/fine_tuning/val.jsonl"
    max_seq_length: int = 4096
    dataset_text_field: str = "text"      # field in JSONL containing ChatML text
    packing: bool = False                  # don't pack — patch sequences vary in length

    # Training
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 2
    gradient_accumulation_steps: int = 4  # effective batch = 4 * 4 = 16
    learning_rate: float = 2e-4
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.05
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0
    optim: str = "paged_adamw_32bit"      # memory-efficient adamw

    # Mixed precision — exactly one of the two flags is enabled by default.
    bf16: bool = True    # bfloat16 training
    fp16: bool = False

    # Saving & logging
    # save_steps and eval_steps are deliberately equal so every saved
    # checkpoint has an eval_loss to be ranked by.
    save_strategy: str = "steps"
    save_steps: int = 100
    # Cap on the number of checkpoints kept on disk. This is a count limit,
    # not a "top 3 by metric" selection.
    # NOTE(review): confirm how the trainer treats the best checkpoint.
    save_total_limit: int = 3
    logging_steps: int = 10
    eval_strategy: str = "steps"
    eval_steps: int = 100
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False       # lower eval_loss is better

    # MLflow / W&B
    report_to: str = "mlflow"
    mlflow_experiment_name: str = "deepseek-coder-qlora"

    # LoRA + quantisation (factories give every config its own nested objects)
    lora: LoRAConfig = field(default_factory=LoRAConfig)
    bnb: BitsAndBytesConfig = field(default_factory=BitsAndBytesConfig)

    # Inference
    max_new_tokens: int = 1024
    do_sample: bool = False    # greedy for deterministic patches
    # NOTE(review): with do_sample=False the temperature is presumably
    # ignored by the generation code; it matters only once sampling is on.
    temperature: float = 0.2

    @property
    def effective_batch_size(self) -> int:
        """Samples per optimiser step on one device.

        Per-device batch size times gradient-accumulation steps. The number
        of devices is not part of this config, so it is not multiplied in.
        """
        return self.per_device_train_batch_size * self.gradient_accumulation_steps

    @property
    def output_path(self) -> Path:
        """``output_dir`` wrapped in a ``pathlib.Path``."""
        return Path(self.output_dir)

    def estimate_vram_gb(self, *, hidden_size: int = 4096) -> float:
        """Rough VRAM estimate in GB.

        Adds two fixed terms (4.5 GB for a 7B model in 4-bit, 0.12 GB for the
        LoRA adapters) to the size of ONE hidden-state tensor of shape
        ``(per_device_train_batch_size, max_seq_length, hidden_size)`` at
        2 bytes per element.

        The activation term has no per-layer multiplier and ignores gradients
        and optimiser state. With the defaults it is only ~0.13 GB, far below
        the ~8 GB activation/gradient figure quoted in the module docstring,
        so treat the result as a lower bound rather than a budget.

        Args:
            hidden_size: Model hidden dimension. Defaults to 4096, the value
                previously hard-coded; pass the real value for base models
                whose hidden size differs.

        Returns:
            Estimated VRAM in decimal gigabytes (1e9 bytes).
        """
        model_gb = 4.5    # 7B in 4-bit
        lora_gb = 0.12    # LoRA adapters
        bytes_per_element = 2  # bf16 / fp16
        activation_bytes = (
            self.per_device_train_batch_size
            * self.max_seq_length
            * hidden_size
            * bytes_per_element
        )
        activations_gb = activation_bytes / 1e9
        return model_gb + lora_gb + activations_gb


# ── Alternative configs for ablation ─────────────────────────────────────────

def get_config(variant: str = "default") -> TrainingConfig:
    """
    Pre-built configs for ablation experiments.

    Only the requested variant is constructed, and every call returns a
    freshly built ``TrainingConfig``.

    Variants:
        default     — standard QLoRA, 3 epochs
        small_r     — r=8 (less capacity, faster)
        large_r     — r=32 (more capacity, slower)
        no_mlp      — skip MLP modules (attention-only LoRA)
        longer      — 5 epochs (risk of overfitting)
        qwen        — default hyperparameters with Qwen2.5-Coder-7B-Instruct as base

    Args:
        variant: Name of one of the variants listed above.

    Returns:
        The ``TrainingConfig`` for ``variant``.

    Raises:
        ValueError: If ``variant`` is not a known variant name.
    """
    # Zero-argument builders, so unrelated variants are never instantiated.
    builders = {
        "default": lambda: TrainingConfig(),
        "small_r": lambda: TrainingConfig(lora=LoRAConfig(r=8, lora_alpha=16)),
        "large_r": lambda: TrainingConfig(lora=LoRAConfig(r=32, lora_alpha=64)),
        "no_mlp": lambda: TrainingConfig(
            lora=LoRAConfig(target_modules=["q_proj", "v_proj", "k_proj", "o_proj"])
        ),
        "longer": lambda: TrainingConfig(num_train_epochs=5),
        "qwen": lambda: TrainingConfig(model_name="Qwen/Qwen2.5-Coder-7B-Instruct"),
    }
    if variant not in builders:
        raise ValueError(f"Unknown variant: {variant}. Choose from {list(builders)}")
    return builders[variant]()