repomind-api / fine_tuning /evaluator.py
SouravNath's picture
Initial commit
dc71cad
"""
fine_tuning/evaluator.py
──────────────────────────
Post-training evaluation of the fine-tuned model on SWE-bench Lite.
Evaluation pipeline:
1. Load the fine-tuned LoRA adapter (or merged model)
2. For each test instance:
a. Localise files (Phase 3 pipeline)
b. Generate patch with fine-tuned model
c. Apply patch and run tests in sandbox
d. Record result: resolved / not + failure category
3. Compute aggregate metrics:
- % resolved (primary metric)
- avg_attempts (secondary — fine-tuned should need fewer retries)
- token_cost_per_issue (efficiency metric)
4. Ablation table: base GPT-4o vs fine-tuned DeepSeek vs +conformal
Ablation table (expected results from the roadmap):
| Variant | % Resolved | Recall@5 |
|--------------------------|------------|----------|
| Naive GPT-4o baseline | 10–18% | 41% |
| + Graph localisation | 25–28% | 74% |
| + Reflection loop | 30–35% | 74% |
| + DeepSeek fine-tuned | 38–44% | 74% |
"""
from __future__ import annotations
import json
import logging
import time
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Literal, Optional
logger = logging.getLogger(__name__)
# ── Result types ──────────────────────────────────────────────────────────────
@dataclass
class EvalResult:
    """Outcome of evaluating one benchmark instance with one model variant.

    Field order is part of the interface: it fixes the positional
    constructor signature and the key order produced by ``asdict``.
    """

    # -- identification --
    instance_id: str  # identifier of the evaluated instance
    repo: str  # repository the instance belongs to

    # -- outcome --
    resolved: bool  # True when the instance counts as resolved
    attempts: int  # number of attempts recorded for this instance
    elapsed_seconds: float  # time spent on this instance, in seconds
    token_cost: int  # token cost recorded for this instance
    patch: str  # patch text produced for this instance
    failure_category: str  # label tallied by EvaluationReport.failure_breakdown
    model_variant: str  # name of the variant that produced this result
@dataclass
class AblationRow:
    """A single line of the ablation table.

    ``pct_resolved`` and ``recall_at_5`` are stored as fractions and are
    multiplied by 100 when rendered. ``notes`` is kept on the row but is not
    part of the rendered markdown line.
    """

    system_variant: str
    pct_resolved: float
    recall_at_5: float
    avg_attempts: float
    avg_token_cost: float
    n_instances: int
    notes: str = ""

    def to_markdown_row(self) -> str:
        """Render this row as one pipe-delimited markdown table line.

        Returns:
            The six fixed-width cells (variant, resolved %, recall@5 %,
            average attempts, average token cost, instance count), each
            separated by `` | `` and wrapped in a leading ``| `` and a
            trailing `` |``.
        """
        resolved_pct = self.pct_resolved * 100
        recall_pct = self.recall_at_5 * 100
        cells = [
            f"{self.system_variant:<40}",
            f"{resolved_pct:>6.1f}%",
            f"{recall_pct:>6.1f}%",
            f"{self.avg_attempts:>7.2f}",
            f"{self.avg_token_cost:>12,.0f}",
            f"{self.n_instances:>5}",
        ]
        return "| " + " | ".join(cells) + " |"
@dataclass
class EvaluationReport:
    """Per-instance results for one variant, plus derived summary statistics.

    All statistics are computed on access from ``results``; nothing is
    cached, so appending to ``results`` is reflected immediately.
    """

    variant: str
    results: list[EvalResult] = field(default_factory=list)

    def _mean_of(self, attribute: str) -> float:
        """Average ``attribute`` across all results; 0.0 when there are none."""
        if not self.results:
            return 0.0
        total = sum(getattr(item, attribute) for item in self.results)
        return total / len(self.results)

    @property
    def n_total(self) -> int:
        """Number of recorded results."""
        return len(self.results)

    @property
    def n_resolved(self) -> int:
        """Number of results whose ``resolved`` flag is truthy."""
        return len([item for item in self.results if item.resolved])

    @property
    def pct_resolved(self) -> float:
        """Resolved fraction in [0, 1]; 0.0 for an empty report."""
        # Clamp the denominator so an empty report divides by 1, not 0.
        denominator = max(self.n_total, 1)
        return self.n_resolved / denominator

    @property
    def avg_attempts(self) -> float:
        """Mean ``attempts`` per result (0.0 when empty)."""
        return self._mean_of("attempts")

    @property
    def avg_token_cost(self) -> float:
        """Mean ``token_cost`` per result (0.0 when empty)."""
        return self._mean_of("token_cost")

    @property
    def avg_elapsed_seconds(self) -> float:
        """Mean ``elapsed_seconds`` per result (0.0 when empty)."""
        return self._mean_of("elapsed_seconds")

    @property
    def failure_breakdown(self) -> dict[str, int]:
        """Count results per ``failure_category``.

        Every result is counted, resolved or not; keys appear in order of
        first occurrence.
        """
        tally: dict[str, int] = {}
        for item in self.results:
            category = item.failure_category
            if category in tally:
                tally[category] += 1
            else:
                tally[category] = 1
        return tally

    def to_ablation_row(self, recall_at_5: float = 0.0) -> AblationRow:
        """Summarise this report as an ``AblationRow``.

        Args:
            recall_at_5: Recall@5 value to attach to the row; it is supplied
                by the caller because the report itself does not track it.
        """
        summary = {
            "system_variant": self.variant,
            "pct_resolved": self.pct_resolved,
            "recall_at_5": recall_at_5,
            "avg_attempts": self.avg_attempts,
            "avg_token_cost": self.avg_token_cost,
            "n_instances": self.n_total,
        }
        return AblationRow(**summary)

    def save(self, path: Path) -> None:
        """Write the report (summary plus every result) to ``path`` as JSON.

        Missing parent directories are created first.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        summary = {
            "n_total": self.n_total,
            "n_resolved": self.n_resolved,
            "pct_resolved": self.pct_resolved,
            "avg_attempts": self.avg_attempts,
            "avg_token_cost": self.avg_token_cost,
            "avg_elapsed_seconds": self.avg_elapsed_seconds,
            "failure_breakdown": self.failure_breakdown,
        }
        payload = {
            "variant": self.variant,
            "summary": summary,
            "results": [asdict(item) for item in self.results],
        }
        serialized = json.dumps(payload, indent=2)
        path.write_text(serialized)
# ── Ablation table builder ────────────────────────────────────────────────────
class AblationTableBuilder:
    """
    Builds the ablation table from multiple EvaluationReport objects.

    Every builder starts with its own copies of the published baseline rows
    (Devin, SWE-agent); further rows are appended with ``add_report`` or
    ``add_row`` and rendered in insertion order.
    """

    # NOTE(review): the figures, the model label, the citation in ``notes``
    # and n_instances=300 below are hard-coded here without a source link —
    # verify them against the original publications before the table is
    # circulated.
    PUBLISHED_BASELINES = [
        AblationRow(
            system_variant="SWE-agent (Claude-3.5, published)",
            pct_resolved=0.1247,
            recall_at_5=0.0,
            avg_attempts=1.0,
            avg_token_cost=0,
            n_instances=300,
            notes="Yao et al. 2024",
        ),
        AblationRow(
            system_variant="Devin (published)",
            pct_resolved=0.1386,
            recall_at_5=0.0,
            avg_attempts=1.0,
            avg_token_cost=0,
            n_instances=300,
            notes="Cognition AI 2024",
        ),
    ]

    def __init__(self):
        # Copy each baseline row instead of sharing the class-level objects:
        # a shallow list copy would let a mutation made through one builder
        # leak into PUBLISHED_BASELINES and every other builder.
        self._rows: list[AblationRow] = [
            type(row)(**asdict(row)) for row in self.PUBLISHED_BASELINES
        ]

    def add_report(self, report: EvaluationReport, recall_at_5: float = 0.0) -> None:
        """Append the summary row of ``report``, tagged with ``recall_at_5``."""
        self._rows.append(report.to_ablation_row(recall_at_5))

    def add_row(self, row: AblationRow) -> None:
        """Append an already-built row as-is."""
        self._rows.append(row)

    def to_markdown(self) -> str:
        """Return the table (header, separator, one line per row) as markdown."""
        header = (
            "| System Variant "
            "| Resolved "
            "| Recall@5 "
            "| Avg Attempts "
            "| Avg Token Cost "
            "| N |\n"
            "|------------------------------------------|"
            "----------|"
            "----------|"
            "--------------|"
            "----------------|"
            "-----|"
        )
        rows = "\n".join(r.to_markdown_row() for r in self._rows)
        return header + "\n" + rows

    def save_markdown(self, path: Path) -> None:
        """Write the table to ``path`` as a markdown document.

        ``path`` may be a ``Path`` or a plain string; missing parent
        directories are created.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        # Variant names are caller-supplied and may be non-ASCII, so do not
        # depend on the platform's default encoding.
        path.write_text(
            f"# Ablation Results\n\n{self.to_markdown()}\n", encoding="utf-8"
        )
        logger.info("Ablation table saved to %s", path)

    def save_json(self, path: Path) -> None:
        """Write all rows (including ``notes``) to ``path`` as a JSON list.

        ``path`` may be a ``Path`` or a plain string; missing parent
        directories are created.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(
            json.dumps([asdict(r) for r in self._rows], indent=2),
            encoding="utf-8",
        )
# ── Inference helper ──────────────────────────────────────────────────────────
class FinetunedModelInference:
    """
    Wrapper for the fine-tuned DeepSeek-Coder model.

    Supports both LoRA adapter and merged model loading. Heavy dependencies
    (torch, transformers, peft) are imported lazily inside the methods so
    that importing this module stays cheap.
    """

    def __init__(
        self,
        model_path: str,
        use_lora: bool = True,
        base_model: str = "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
        load_in_4bit: bool = True,
    ):
        """Store the configuration; nothing is loaded until ``load()``.

        Args:
            model_path: Location of the LoRA adapter (``use_lora=True``) or
                of the full model (``use_lora=False``). The tokenizer is
                always loaded from here.
            use_lora: Load ``base_model`` and apply the adapter found at
                ``model_path`` when True; load ``model_path`` directly
                otherwise.
            base_model: Base checkpoint used only when ``use_lora`` is True.
            load_in_4bit: Build a 4-bit NF4 ``BitsAndBytesConfig`` for
                loading when True.
        """
        self.model_path = model_path
        self.use_lora = use_lora
        self.base_model = base_model
        self.load_in_4bit = load_in_4bit
        self._model = None
        self._tokenizer = None

    def load(self) -> None:
        """Load model and tokenizer into memory (deferred heavy imports).

        Raises:
            ImportError: If a required package cannot be imported; the
                message carries an install hint and the original error is
                attached as ``__cause__``.
        """
        # The try deliberately spans the whole load: the libraries may raise
        # ImportError lazily for optional dependencies, and the install hint
        # should cover those cases too.
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

            bnb_cfg = None
            if self.load_in_4bit:
                bnb_cfg = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=True,
                )
            # NOTE(security): trust_remote_code=True allows code shipped with
            # the checkpoint to run locally — only point this at trusted
            # model repositories/paths.
            model = AutoModelForCausalLM.from_pretrained(
                self.base_model if self.use_lora else self.model_path,
                quantization_config=bnb_cfg,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
            )
            if self.use_lora:
                from peft import PeftModel

                model = PeftModel.from_pretrained(model, self.model_path)
                model = model.merge_and_unload()  # merge for fast inference
            model = model.eval()
            tokenizer = AutoTokenizer.from_pretrained(
                self.model_path, trust_remote_code=True
            )
        except ImportError as e:
            raise ImportError(
                f"Install: pip install transformers peft torch bitsandbytes\n{e}"
            ) from e
        # Publish both attributes only after everything succeeded, so a
        # failure part-way (e.g. the tokenizer) never leaves a model without
        # a tokenizer behind.
        self._model = model
        self._tokenizer = tokenizer
        logger.info("Fine-tuned model loaded from %s", self.model_path)

    def generate_patch(self, user_prompt: str, system_prompt: str, max_new_tokens: int = 1024) -> str:
        """Generate a unified diff patch for the given prompt.

        Loads the model on first use. Decoding is greedy
        (``do_sample=False``).

        Args:
            user_prompt: Text placed in the template's ``user`` slot.
            system_prompt: Text placed in the template's ``system`` slot.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The newly generated text only (prompt excluded), stripped of
            surrounding whitespace.
        """
        if self._model is None or self._tokenizer is None:
            self.load()
        import torch
        from fine_tuning.dataset_builder import CHATML_TEMPLATE

        # Fill the template with an empty assistant turn and drop trailing
        # whitespace so generation continues from the assistant slot.
        # NOTE(review): assumes the template ends with the assistant
        # placeholder — confirm against dataset_builder.
        prompt = CHATML_TEMPLATE.format(
            system=system_prompt, user=user_prompt, assistant=""
        ).rstrip()
        # NOTE(review): prompts longer than 4096 tokens are truncated by the
        # tokenizer; confirm the truncated side does not remove the
        # assistant header.
        inputs = self._tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=4096
        ).to(self._model.device)
        with torch.inference_mode():
            output = self._model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                temperature=1.0,  # deterministic when do_sample=False
                pad_token_id=self._tokenizer.eos_token_id,
            )
        # Decode only the new tokens (not the prompt)
        new_tokens = output[0][inputs["input_ids"].shape[1]:]
        patch = self._tokenizer.decode(new_tokens, skip_special_tokens=True)
        return patch.strip()

    def batch_generate(self, prompts: list[str], system_prompt: str, **kwargs) -> list[str]:
        """Generate patches for a batch of prompts.

        Prompts are processed one at a time via ``generate_patch`` with the
        same ``system_prompt``; ``kwargs`` are forwarded unchanged. Results
        are returned in input order.
        """
        return [self.generate_patch(p, system_prompt, **kwargs) for p in prompts]