"""
lora_plan.py + distill.py — Adapter training plans and distillation runner.
These are DRY-RUN planning modules. They produce configs and plans
but do NOT execute training (that requires purpose-agent[train] extra).
Key rule: no distillation without eval data AND ROI check.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Literal
@dataclass
class DistillationPlan:
    """
    Plan for creating a task-native mini-model or adapter.

    This is a DRY-RUN artifact: it records what training *would* be done,
    with cost/time estimates and an acceptance/rollback contract, but it
    never executes training itself.

    Modes:
        none        → no optimization needed (performance is fine)
        prompt_pack → optimize prompts only (cheapest)
        lora        → LoRA/QLoRA adapter on base model
        distill     → full knowledge distillation teacher→student
        pdq         → prune → distill → quantize (maximum compression)
    """

    mode: Literal["none", "prompt_pack", "lora", "distill", "pdq"] = "none"
    teacher_model: str = ""      # model whose behavior is being compressed
    student_base: str = ""       # base checkpoint the adapter/student starts from
    dataset_path: str = ""       # training data location
    eval_path: str = ""          # held-out eval data path
    target_format: str = "gguf"  # export format for the finished artifact
    acceptance_score: float = 0.9  # Candidate must achieve this on eval
    rollback_model: str = ""     # What to revert to if candidate fails
    estimated_cost_usd: float = 0.0
    estimated_time_hours: float = 0.0
    reason: str = ""             # human-readable justification for the chosen mode
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def requires_gpu(self) -> bool:
        """True for modes that update weights and therefore need a GPU."""
        return self.mode in ("lora", "distill", "pdq")

    @property
    def requires_train_extra(self) -> bool:
        """True for modes that need the purpose-agent[train] extra installed."""
        return self.mode in ("lora", "distill", "pdq")

    def to_dict(self) -> dict[str, Any]:
        """Serialize the plan, including derived flags.

        Fix: previously omitted ``requires_train_extra`` and ``metadata``,
        making serialization lossy (``requires_gpu`` was included but its
        twin flag was not). Both are added; existing keys are unchanged,
        so consumers of the old shape keep working.
        """
        return {
            "mode": self.mode,
            "teacher_model": self.teacher_model,
            "student_base": self.student_base,
            "dataset_path": self.dataset_path,
            "eval_path": self.eval_path,
            "target_format": self.target_format,
            "acceptance_score": self.acceptance_score,
            "rollback_model": self.rollback_model,
            "estimated_cost_usd": self.estimated_cost_usd,
            "estimated_time_hours": self.estimated_time_hours,
            "reason": self.reason,
            "requires_gpu": self.requires_gpu,
            "requires_train_extra": self.requires_train_extra,
            "metadata": self.metadata,
        }

    def summary(self) -> str:
        """Return a human-readable multi-line description of the plan."""
        if self.mode == "none":
            return "No optimization needed."
        return (
            f"Plan: {self.mode}\n"
            f" Teacher: {self.teacher_model}\n"
            f" Student: {self.student_base}\n"
            f" Dataset: {self.dataset_path}\n"
            f" Acceptance: {self.acceptance_score:.0%}\n"
            f" Est. cost: ${self.estimated_cost_usd:.2f}\n"
            f" Est. time: {self.estimated_time_hours:.1f}h\n"
            f" Reason: {self.reason}"
        )
def plan_distillation(
    fingerprint: dict[str, Any],
    dataset_size: int,
    current_model: str = "",
    target_model: str = "",
    has_gpu: bool = False,
) -> DistillationPlan:
    """
    Create a distillation plan based on available data and hardware.

    Decision rules (corrected to match the implementation — the previous
    docstring claimed "No dataset → prompt_pack only" and omitted the
    GPU condition's precedence):
      - dataset_size < 10                 → mode "none" (too little data for anything)
      - dataset_size < 100 OR no GPU      → prompt_pack (no weight updates)
      - 100 <= dataset_size < 1000, GPU   → LoRA adapter
      - dataset_size >= 1000, GPU         → full distillation

    Args:
        fingerprint: Capability fingerprint. NOTE(review): currently unused
            by the decision rules; kept for interface stability — confirm
            whether it should feed the mode choice or be documented as reserved.
        dataset_size: Number of training examples available.
        current_model: Model currently in use; becomes the teacher and the
            rollback target for weight-update modes.
        target_model: Preferred student base; falls back to current_model
            (LoRA) or a default small instruct model (distill).
        has_gpu: Whether training hardware is available locally.

    Returns:
        A DistillationPlan describing the recommended optimization.
        This is a dry-run plan only; nothing is executed.
    """
    # Below this floor, even prompt optimization has nothing to work with.
    if dataset_size < 10:
        return DistillationPlan(mode="none", reason="Insufficient data for any optimization")
    # No GPU means no weight updates regardless of dataset size; likewise
    # a small dataset cannot support them even with a GPU.
    if dataset_size < 100 or not has_gpu:
        return DistillationPlan(
            mode="prompt_pack",
            teacher_model=current_model,
            reason=f"{'No GPU available' if not has_gpu else 'Dataset too small for weight updates'} → prompt optimization only",
            estimated_cost_usd=0.0,
            estimated_time_hours=0.01,
        )
    # Mid-size dataset + GPU: adapter training is the sweet spot.
    if dataset_size < 1000:
        return DistillationPlan(
            mode="lora",
            teacher_model=current_model,
            student_base=target_model or current_model,
            reason=f"Dataset ({dataset_size} examples) suitable for LoRA adapter",
            estimated_cost_usd=2.0,
            estimated_time_hours=1.0,
            acceptance_score=0.9,
            rollback_model=current_model,
        )
    # Large dataset + GPU: full teacher→student distillation is viable.
    # Acceptance bar is slightly lower (0.85) because the student is a
    # different, smaller architecture rather than an adapter on the teacher.
    return DistillationPlan(
        mode="distill",
        teacher_model=current_model,
        student_base=target_model or "Qwen/Qwen2.5-1.5B-Instruct",
        reason=f"Large dataset ({dataset_size} examples) → full distillation viable",
        estimated_cost_usd=10.0,
        estimated_time_hours=4.0,
        acceptance_score=0.85,
        rollback_model=current_model,
    )
|