# Spaces: SouravNath / repomind-api (Running)
# File size: 10,466 Bytes
# dc71cad

"""
fine_tuning/evaluator.py
──────────────────────────
Post-training evaluation of the fine-tuned model on SWE-bench Lite.

Evaluation pipeline:
  1. Load the fine-tuned LoRA adapter (or merged model)
  2. For each test instance:
       a. Localise files (Phase 3 pipeline)
       b. Generate patch with fine-tuned model
       c. Apply patch and run tests in sandbox
       d. Record result: resolved / not + failure category
  3. Compute aggregate metrics:
       - % resolved (primary metric)
       - avg_attempts (secondary — fine-tuned should need fewer retries)
       - token_cost_per_issue (efficiency metric)
  4. Ablation table: base GPT-4o vs fine-tuned DeepSeek vs +conformal

Ablation table (expected results from the roadmap):
  | Variant                  | % Resolved | Recall@5 |
  |--------------------------|------------|----------|
  | Naive GPT-4o baseline    | 10–18%     | 41%      |
  | + Graph localisation     | 25–28%     | 74%      |
  | + Reflection loop        | 30–35%     | 74%      |
  | + DeepSeek fine-tuned    | 38–44%     | 74%      |
"""
from __future__ import annotations

import json
import logging
import time
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Literal, Optional

logger = logging.getLogger(__name__)


# ── Result types ──────────────────────────────────────────────────────────────

@dataclass
class EvalResult:
    """Record of one evaluated instance for one model variant.

    Instances of this class are collected in ``EvaluationReport.results``
    and serialised with ``dataclasses.asdict`` when a report is saved.
    """
    # Identifier of the evaluated instance and the repository it belongs to.
    instance_id: str
    repo: str
    # Whether the instance counted as resolved; drives the % resolved metric.
    resolved: bool
    # Number of attempts recorded for this instance (averaged in reports).
    attempts: int
    # Time recorded for this instance, in seconds (averaged in reports).
    elapsed_seconds: float
    # Token cost recorded for this instance (averaged in reports).
    token_cost: int
    # The patch text associated with this result.
    patch: str
    # Label used as the grouping key in the report's failure breakdown.
    failure_category: str
    # Name of the model variant that produced this result.
    model_variant: str


@dataclass
class AblationRow:
    """A single line of the ablation table.

    ``pct_resolved`` and ``recall_at_5`` are stored as fractions and are
    scaled by 100 when rendered. ``notes`` is kept on the row (and therefore
    appears in ``asdict`` output) but is not part of the markdown rendering.
    """
    system_variant: str
    pct_resolved: float
    recall_at_5: float
    avg_attempts: float
    avg_token_cost: float
    n_instances: int
    notes: str = ""

    def to_markdown_row(self) -> str:
        """Render this row as one pipe-delimited markdown table line.

        Returns:
            A string of six fixed-width cells: variant name (left-aligned,
            width 40), resolved % and recall % (one decimal), average
            attempts (two decimals), average token cost (thousands
            separator, no decimals) and instance count.
        """
        # Each cell is formatted on its own, then stitched together with the
        # column separator so the delimiters are written only once.
        cells = [
            format(self.system_variant, "<40"),
            format(self.pct_resolved * 100, ">6.1f") + "%",
            format(self.recall_at_5 * 100, ">6.1f") + "%",
            format(self.avg_attempts, ">7.2f"),
            format(self.avg_token_cost, ">12,.0f"),
            format(self.n_instances, ">5"),
        ]
        return "| " + " | ".join(cells) + " |"


@dataclass
class EvaluationReport:
    """All per-instance results for one system variant, with aggregates.

    The aggregate values are computed on demand from ``results``; nothing is
    cached, so appending to ``results`` is reflected immediately.
    """
    variant: str
    results: list[EvalResult] = field(default_factory=list)

    def _mean_of(self, attr: str) -> float:
        """Average attribute ``attr`` over all results; 0.0 when empty."""
        if not self.results:
            return 0.0
        return sum(getattr(item, attr) for item in self.results) / len(self.results)

    @property
    def n_total(self) -> int:
        """Number of recorded results."""
        return len(self.results)

    @property
    def n_resolved(self) -> int:
        """Number of results whose ``resolved`` flag is truthy."""
        return len([item for item in self.results if item.resolved])

    @property
    def pct_resolved(self) -> float:
        """Resolved results as a fraction of all results (0.0 when empty)."""
        total = self.n_total
        # Guard the denominator so an empty report yields 0.0, not an error.
        denominator = total if total > 1 else 1
        return self.n_resolved / denominator

    @property
    def avg_attempts(self) -> float:
        """Mean ``attempts`` per result (0.0 when empty)."""
        return self._mean_of("attempts")

    @property
    def avg_token_cost(self) -> float:
        """Mean ``token_cost`` per result (0.0 when empty)."""
        return self._mean_of("token_cost")

    @property
    def avg_elapsed_seconds(self) -> float:
        """Mean ``elapsed_seconds`` per result (0.0 when empty)."""
        return self._mean_of("elapsed_seconds")

    @property
    def failure_breakdown(self) -> dict[str, int]:
        """Count of results per ``failure_category`` value.

        Every result is counted, whatever its ``resolved`` flag says; keys
        appear in first-seen order.
        """
        counts: dict[str, int] = {}
        for item in self.results:
            label = item.failure_category
            if label in counts:
                counts[label] += 1
            else:
                counts[label] = 1
        return counts

    def to_ablation_row(self, recall_at_5: float = 0.0) -> AblationRow:
        """Summarise this report as an ``AblationRow``.

        Args:
            recall_at_5: Value copied into the row unchanged; it is not
                computed from the results held here.
        """
        row_fields = {
            "system_variant": self.variant,
            "pct_resolved": self.pct_resolved,
            "recall_at_5": recall_at_5,
            "avg_attempts": self.avg_attempts,
            "avg_token_cost": self.avg_token_cost,
            "n_instances": self.n_total,
        }
        return AblationRow(**row_fields)

    def save(self, path: Path) -> None:
        """Write the report (summary plus every result) to ``path`` as JSON.

        Missing parent directories of ``path`` are created first.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        summary = {
            "n_total": self.n_total,
            "n_resolved": self.n_resolved,
            "pct_resolved": self.pct_resolved,
            "avg_attempts": self.avg_attempts,
            "avg_token_cost": self.avg_token_cost,
            "avg_elapsed_seconds": self.avg_elapsed_seconds,
            "failure_breakdown": self.failure_breakdown,
        }
        payload = {
            "variant": self.variant,
            "summary": summary,
            "results": [asdict(item) for item in self.results],
        }
        path.write_text(json.dumps(payload, indent=2))


# ── Ablation table builder ────────────────────────────────────────────────────

class AblationTableBuilder:
    """
    Collects AblationRow entries and renders them as markdown or JSON.

    Every builder starts with the rows in ``PUBLISHED_BASELINES``. Rows
    derived from EvaluationReport objects (``add_report``) or supplied
    directly (``add_row``) are appended after them, in insertion order.
    """

    # NOTE(review): the citation and model label of the SWE-agent row, and
    # the figures in both rows, should be checked against the original
    # publications before the table is published.
    # NOTE(review): recall_at_5=0.0 and avg_token_cost=0 look like
    # placeholders for metrics the baselines do not report; they render as
    # "0.0%" and "0" in the table — confirm that is intended.
    PUBLISHED_BASELINES = [
        AblationRow(
            system_variant="SWE-agent (Claude-3.5, published)",
            pct_resolved=0.1247,
            recall_at_5=0.0,
            avg_attempts=1.0,
            avg_token_cost=0,
            n_instances=300,
            notes="Yao et al. 2024",
        ),
        AblationRow(
            system_variant="Devin (published)",
            pct_resolved=0.1386,
            recall_at_5=0.0,
            avg_attempts=1.0,
            avg_token_cost=0,
            n_instances=300,
            notes="Cognition AI 2024",
        ),
    ]

    def __init__(self):
        # Copy the list so appends stay local to this builder. The row
        # objects themselves are still shared with the class attribute.
        self._rows: list[AblationRow] = list(self.PUBLISHED_BASELINES)

    def add_report(self, report: EvaluationReport, recall_at_5: float = 0.0) -> None:
        """Append a row summarising ``report``.

        Args:
            report: Report whose aggregates become the row's values.
            recall_at_5: Passed through to the row unchanged.
        """
        self._rows.append(report.to_ablation_row(recall_at_5))

    def add_row(self, row: AblationRow) -> None:
        """Append an already-built row."""
        self._rows.append(row)

    def to_markdown(self) -> str:
        """Return the table (header, separator, one line per row) as markdown."""
        header = (
            "| System Variant                           "
            "| Resolved "
            "| Recall@5 "
            "| Avg Attempts "
            "| Avg Token Cost "
            "| N |\n"
            "|------------------------------------------|"
            "----------|"
            "----------|"
            "--------------|"
            "----------------|"
            "-----|"
        )
        rows = "\n".join(r.to_markdown_row() for r in self._rows)
        return header + "\n" + rows

    def save_markdown(self, path: Path) -> None:
        """Write the markdown table, under a title line, to ``path``.

        Missing parent directories are created. The file is always written
        as UTF-8: variant names are free text, and relying on the platform
        default encoding can fail or produce a differently-encoded file on
        systems whose locale is not UTF-8.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(
            f"# Ablation Results\n\n{self.to_markdown()}\n", encoding="utf-8"
        )
        logger.info("Ablation table saved to %s", path)

    def save_json(self, path: Path) -> None:
        """Write every row (all fields, including ``notes``) to ``path`` as JSON.

        Missing parent directories are created.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        # json.dumps escapes non-ASCII by default, so the explicit encoding
        # does not change the bytes written; it just matches save_markdown.
        path.write_text(
            json.dumps([asdict(r) for r in self._rows], indent=2),
            encoding="utf-8",
        )


# ── Inference helper ──────────────────────────────────────────────────────────

class FinetunedModelInference:
    """
    Wrapper for the fine-tuned DeepSeek-Coder model.
    Supports both LoRA adapter and merged model loading.

    Heavy dependencies (torch, transformers, peft) are imported lazily inside
    the methods, so constructing the wrapper is cheap; the model is loaded on
    the first call to ``load()`` or ``generate_patch()``.
    """

    def __init__(
        self,
        model_path: str,
        use_lora: bool = True,
        base_model: str = "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
        load_in_4bit: bool = True,
    ):
        """Store the loading configuration; nothing is loaded yet.

        Args:
            model_path: Location of the LoRA adapter (``use_lora=True``) or of
                the full model (``use_lora=False``). The tokenizer is always
                loaded from this location.
            use_lora: When True, ``base_model`` is loaded and the adapter at
                ``model_path`` is applied and merged into it.
            base_model: Base model identifier; only used when ``use_lora``.
            load_in_4bit: When True, the model is loaded with 4-bit NF4
                quantisation via bitsandbytes.
        """
        self.model_path = model_path
        self.use_lora = use_lora
        self.base_model = base_model
        self.load_in_4bit = load_in_4bit
        self._model = None
        self._tokenizer = None

    def load(self) -> None:
        """Load model and tokenizer into memory (imports are deferred to here).

        ``self._model`` and ``self._tokenizer`` are assigned together, and only
        after both have loaded, so a failure part-way through never leaves the
        wrapper half-initialised.

        Raises:
            ImportError: If a required package is missing. The message carries
                an install hint and the original error is chained as the cause.
        """
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

            bnb_cfg = None
            if self.load_in_4bit:
                bnb_cfg = BitsAndBytesConfig(
                    load_in_4bit=True, bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                )

            # NOTE(review): trust_remote_code=True lets the model repository
            # run its own Python code at load time; only use trusted sources.
            model = AutoModelForCausalLM.from_pretrained(
                self.base_model if self.use_lora else self.model_path,
                quantization_config=bnb_cfg,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
            )

            if self.use_lora:
                from peft import PeftModel
                model = PeftModel.from_pretrained(model, self.model_path)
                model = model.merge_and_unload()  # merge for fast inference

            model = model.eval()
            tokenizer = AutoTokenizer.from_pretrained(
                self.model_path, trust_remote_code=True
            )

        except ImportError as e:
            # The try block deliberately spans the whole load: some optional
            # packages are only imported inside from_pretrained. Chain the
            # original error so the real missing module stays visible.
            raise ImportError(
                f"Install: pip install transformers peft torch bitsandbytes\n{e}"
            ) from e

        # Publish both objects only once everything above has succeeded.
        self._model = model
        self._tokenizer = tokenizer
        logger.info("Fine-tuned model loaded from %s", self.model_path)

    def generate_patch(self, user_prompt: str, system_prompt: str, max_new_tokens: int = 1024) -> str:
        """Generate the model's completion (expected to be a unified diff patch).

        Loads the model first if it has not been loaded yet.

        Args:
            user_prompt: Text placed in the user slot of the chat template.
            system_prompt: Text placed in the system slot of the chat template.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded newly generated text (prompt excluded), stripped of
            leading and trailing whitespace.
        """
        # Check both attributes so a previously failed load is retried rather
        # than crashing later on a missing tokenizer.
        if self._model is None or self._tokenizer is None:
            self.load()

        import torch
        from fine_tuning.dataset_builder import CHATML_TEMPLATE

        # The assistant slot is left empty and trailing whitespace removed;
        # presumably so generation continues right after the assistant header
        # — depends on CHATML_TEMPLATE, which is defined elsewhere.
        prompt = CHATML_TEMPLATE.format(
            system=system_prompt, user=user_prompt, assistant=""
        ).rstrip()

        # Prompts longer than 4096 tokens are truncated by the tokenizer.
        # NOTE(review): which end is cut depends on the tokenizer's
        # truncation side — confirm the assistant header survives truncation.
        inputs = self._tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=4096
        ).to(self._model.device)

        with torch.inference_mode():
            output = self._model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                temperature=1.0,      # deterministic when do_sample=False
                pad_token_id=self._tokenizer.eos_token_id,
            )

        # Decode only the new tokens (not the prompt)
        new_tokens = output[0][inputs["input_ids"].shape[1]:]
        patch = self._tokenizer.decode(new_tokens, skip_special_tokens=True)
        return patch.strip()

    def batch_generate(self, prompts: list[str], system_prompt: str, **kwargs) -> list[str]:
        """Generate patches for a batch of prompts.

        Prompts are processed one at a time, in order, each through
        ``generate_patch`` with the same ``system_prompt`` and ``kwargs``.
        """
        return [self.generate_patch(p, system_prompt, **kwargs) for p in prompts]