# repomind-api / tests/test_phase7_finetuning.py
# SouravNath's picture
# Initial commit
# dc71cad
"""
tests/test_phase7_finetuning.py
────────────────────────────────
Unit tests for Phase 7: dataset builder, QLoRA config, and evaluator.
All tests run without GPU, model download, or real trajectory files.
Run with: pytest tests/test_phase7_finetuning.py -v
"""
from __future__ import annotations
import json
import tempfile
from dataclasses import asdict
from pathlib import Path
import pytest
# ── Helpers ───────────────────────────────────────────────────────────────────
def make_trajectory_entry(
    resolved: bool = True,
    category: str = "assertion_error",
    patch: str = "--- a/foo.py\n+++ b/foo.py\n@@ -1 +1 @@\n-bug\n+fix\n",
    problem: str = "Fix the null pointer error in the queryset filter method call",
    attempt: int = 1,
    instance_id: str = "django__django-123",
    repo: str = "django/django",
) -> dict:
    """Build one synthetic trajectory record for use as test input.

    Every call returns a brand-new dict (including the nested containers), so
    callers may mutate the result without affecting other entries.

    Args:
        resolved: Stored under ``"resolved"`` and also used as the outcome of
            the single fail-to-pass test ``"tests::test_x"``.
        category: Stored under ``"failure_category"``.
        patch: Stored under ``"patch"``; defaults to a minimal unified diff.
        problem: Stored under ``"problem_statement"``.
        attempt: Stored under ``"attempt"``.
        instance_id: Stored under ``"instance_id"``.
        repo: Stored under ``"repo"``. Defaults to ``"django/django"``, the
            value that used to be hard-coded.

    Returns:
        A JSON-serialisable dict with the fourteen trajectory fields.
    """
    return {
        "instance_id": instance_id,
        "repo": repo,
        "attempt": attempt,
        "patch": patch,
        "test_stdout": "AssertionError: expected True got False",
        "fail_to_pass_results": {"tests::test_x": resolved},
        "pass_to_pass_results": {},
        "resolved": resolved,
        "failure_category": category,
        "elapsed_seconds": 5.2,
        "token_cost": {"total_tokens": 1500},
        "localised_files": ["django/db/models/query.py"],
        "problem_statement": problem,
        "timestamp": "2025-05-01T00:00:00+00:00",
    }
def write_trajectory_jsonl(tmp_path: Path, entries: list[dict]) -> Path:
    """Serialise ``entries`` as JSON Lines under ``tmp_path``.

    The file is always ``<tmp_path>/trajectories/test.jsonl``; the
    ``trajectories`` directory is created when missing and an existing file is
    overwritten. One JSON document is written per line.

    Args:
        tmp_path: Base directory for the fixture file.
        entries: JSON-serialisable dicts, written in order.

    Returns:
        The path of the written JSONL file.
    """
    jsonl_file = tmp_path / "trajectories" / "test.jsonl"
    jsonl_file.parent.mkdir(parents=True, exist_ok=True)
    with jsonl_file.open("w") as handle:
        handle.writelines(json.dumps(entry) + "\n" for entry in entries)
    return jsonl_file
# ── QLoRA Config ──────────────────────────────────────────────────────────────
class TestQLoRAConfig:
    """Tests for the configuration objects in ``fine_tuning.qlora_config``.

    Imports are kept inside each test so that a missing or broken module
    fails the individual test rather than collection of the whole file.
    """

    def test_default_config(self):
        """A bare ``TrainingConfig()`` has the expected model name and LoRA rank/alpha."""
        from fine_tuning.qlora_config import TrainingConfig

        defaults = TrainingConfig()
        assert defaults.model_name == "deepseek-ai/deepseek-coder-7b-instruct-v1.5"
        lora = defaults.lora
        assert lora.r == 16
        assert lora.lora_alpha == 32

    def test_lora_scaling(self):
        """``scaling`` for r=16, alpha=32 is 2.0 (32/16)."""
        from fine_tuning.qlora_config import LoRAConfig

        assert LoRAConfig(r=16, lora_alpha=32).scaling == 2.0

    def test_effective_batch_size(self):
        """A per-device batch of 4 with 4 accumulation steps gives an effective batch of 16."""
        from fine_tuning.qlora_config import TrainingConfig

        overrides = {
            "per_device_train_batch_size": 4,
            "gradient_accumulation_steps": 4,
        }
        assert TrainingConfig(**overrides).effective_batch_size == 16

    def test_lora_targets_include_mlp(self):
        """The default LoRA target modules include the three MLP projections."""
        from fine_tuning.qlora_config import LoRAConfig

        targets = LoRAConfig().target_modules
        for mlp_module in ("gate_proj", "up_proj", "down_proj"):
            assert mlp_module in targets

    def test_bnb_config_defaults(self):
        """The default quantisation config is 4-bit NF4 with double quantisation."""
        from fine_tuning.qlora_config import BitsAndBytesConfig

        quant = BitsAndBytesConfig()
        assert quant.load_in_4bit is True
        assert quant.bnb_4bit_quant_type == "nf4"
        assert quant.bnb_4bit_use_double_quant is True

    def test_vram_estimate_positive(self):
        """The VRAM estimate for the default config exceeds 4 GB."""
        from fine_tuning.qlora_config import TrainingConfig

        estimate = TrainingConfig().estimate_vram_gb()
        assert estimate > 4.0  # at least model size

    def test_get_config_variants(self):
        """Every named variant resolves to a config with a model name set."""
        from fine_tuning.qlora_config import get_config

        known_variants = ("default", "small_r", "large_r", "no_mlp", "longer", "qwen")
        for name in known_variants:
            assert get_config(name).model_name is not None

    def test_get_config_invalid_raises(self):
        """An unrecognised variant name raises ``ValueError`` mentioning 'Unknown variant'."""
        from fine_tuning.qlora_config import get_config

        with pytest.raises(ValueError, match="Unknown variant"):
            get_config("nonexistent_variant")

    def test_small_r_has_lower_r(self):
        """The ``small_r`` variant uses a strictly smaller LoRA rank than ``default``."""
        from fine_tuning.qlora_config import get_config

        baseline_rank = get_config("default").lora.r
        reduced_rank = get_config("small_r").lora.r
        assert reduced_rank < baseline_rank

    def test_output_path_is_path(self):
        """``output_path`` is exposed as a ``pathlib.Path``."""
        from fine_tuning.qlora_config import TrainingConfig

        assert isinstance(TrainingConfig().output_path, Path)
# ── Training Pair formatting ──────────────────────────────────────────────────
class TestTrainingPair:
    """Tests for the export formats of ``fine_tuning.dataset_builder.TrainingPair``."""

    def _make_pair(self):
        """Return a small ``TrainingPair`` shared by the format tests below."""
        from fine_tuning.dataset_builder import TrainingPair

        fields = {
            "system": "You are an engineer.",
            "user": "Fix the bug:\n## Issue\nDescription",
            "assistant": "--- a/foo.py\n+++ b/foo.py\n",
            "metadata": {"instance_id": "test-1"},
        }
        return TrainingPair(**fields)

    def test_to_chatml_format(self):
        """ChatML output contains a start marker for each role and the end marker."""
        rendered = self._make_pair().to_chatml()
        expected_markers = (
            "<|im_start|>system",
            "<|im_start|>user",
            "<|im_start|>assistant",
            "<|im_end|>",
        )
        for marker in expected_markers:
            assert marker in rendered

    def test_to_alpaca_format(self):
        """Alpaca output has ``instruction``/``output`` keys and the patch as output."""
        record = self._make_pair().to_alpaca()
        assert "instruction" in record
        assert "output" in record
        assert record["output"] == "--- a/foo.py\n+++ b/foo.py\n"

    def test_to_sharegpt_format(self):
        """ShareGPT output lists turns as system, human, gpt in that order."""
        record = self._make_pair().to_sharegpt()
        assert "conversations" in record
        speakers = [turn["from"] for turn in record["conversations"]]
        assert speakers == ["system", "human", "gpt"]

    def test_to_openai_format(self):
        """OpenAI output lists messages as system, user, assistant in that order."""
        record = self._make_pair().to_openai()
        assert "messages" in record
        speakers = [message["role"] for message in record["messages"]]
        assert speakers == ["system", "user", "assistant"]

    def test_chatml_contains_content(self):
        """ChatML output embeds the text of all three fields."""
        rendered = self._make_pair().to_chatml()
        for fragment in ("You are an engineer", "Fix the bug", "--- a/foo.py"):
            assert fragment in rendered
# ── Dataset Builder ───────────────────────────────────────────────────────────
class TestFinetuningDatasetBuilder:
    """Tests for ``fine_tuning.dataset_builder.FinetuningDatasetBuilder``.

    Each test writes synthetic trajectories under pytest's ``tmp_path`` and
    inspects the statistics object returned by ``build()`` or the files it
    writes to ``tmp_path / "output"``.
    """

    def _make_builder(self, tmp_path):
        """Return a builder reading ``tmp_path/trajectories`` and writing ``tmp_path/output``."""
        from fine_tuning.dataset_builder import FinetuningDatasetBuilder

        options = {
            "trajectory_dir": tmp_path / "trajectories",
            "output_dir": tmp_path / "output",
            "val_fraction": 0.2,
            "min_problem_words": 5,  # relaxed for testing
        }
        return FinetuningDatasetBuilder(**options)

    def _populate_trajectories(self, tmp_path, entries: list[dict]) -> Path:
        """Write ``entries`` where ``_make_builder`` will look for them."""
        return write_trajectory_jsonl(tmp_path, entries)

    def test_empty_trajectory_dir(self, tmp_path):
        """A missing trajectory directory produces empty stats."""
        from fine_tuning.dataset_builder import FinetuningDatasetBuilder

        empty_builder = FinetuningDatasetBuilder(
            trajectory_dir=tmp_path / "nonexistent",
            output_dir=tmp_path / "out",
        )
        stats = empty_builder.build()
        assert stats.total_trajectories == 0
        assert stats.train_size == 0

    def test_builds_from_valid_trajectories(self, tmp_path):
        """Ten resolved trajectories are all counted and yield at least one pair."""
        resolved_entries = [make_trajectory_entry(resolved=True) for _ in range(10)]
        self._populate_trajectories(tmp_path, resolved_entries)
        stats = self._make_builder(tmp_path).build(include_reflection_pairs=False)
        assert stats.total_trajectories == 10
        assert stats.train_size + stats.val_size > 0

    def test_filters_unknown_category(self, tmp_path):
        """An entry with category ``unknown`` is recorded as filtered."""
        good = make_trajectory_entry(category="assertion_error")
        rejected = make_trajectory_entry(category="unknown")  # should be filtered
        self._populate_trajectories(tmp_path, [good, rejected])
        stats = self._make_builder(tmp_path).build(include_reflection_pairs=False)
        assert stats.filter_reasons.get("unknown_category", 0) >= 1

    def test_filters_empty_patch(self, tmp_path):
        """An entry with an empty patch is recorded as filtered."""
        self._populate_trajectories(tmp_path, [make_trajectory_entry(patch="")])
        stats = self._make_builder(tmp_path).build(include_reflection_pairs=False)
        assert stats.filter_reasons.get("empty_patch", 0) >= 1

    def test_filters_invalid_patch_format(self, tmp_path):
        """A patch that is plain text rather than a diff is recorded as filtered."""
        self._populate_trajectories(
            tmp_path, [make_trajectory_entry(patch="just some text")]
        )
        stats = self._make_builder(tmp_path).build(include_reflection_pairs=False)
        assert stats.filter_reasons.get("invalid_patch_format", 0) >= 1

    def test_train_val_split(self, tmp_path):
        """With ``val_fraction=0.2`` the validation share lands between 5% and 50%."""
        self._populate_trajectories(
            tmp_path, [make_trajectory_entry() for _ in range(20)]
        )
        stats = self._make_builder(tmp_path).build(include_reflection_pairs=False)
        # val should be ~20% of (train + val)
        combined = stats.train_size + stats.val_size
        assert combined > 0
        # flexible for small datasets
        assert 0.05 < stats.val_size / combined < 0.50

    def test_output_files_created(self, tmp_path):
        """``build()`` writes the train, val and stats files to the output directory."""
        self._populate_trajectories(
            tmp_path, [make_trajectory_entry() for _ in range(5)]
        )
        self._make_builder(tmp_path).build(include_reflection_pairs=False)
        output_dir = tmp_path / "output"
        for filename in ("train.jsonl", "val.jsonl", "dataset_stats.json"):
            assert (output_dir / filename).exists()

    def test_chatml_format_output(self, tmp_path):
        """The first train record in chatml format has a ``text`` field with ChatML markers."""
        self._populate_trajectories(
            tmp_path, [make_trajectory_entry() for _ in range(5)]
        )
        self._make_builder(tmp_path).build(
            format="chatml", include_reflection_pairs=False
        )
        train_path = tmp_path / "output" / "train.jsonl"
        # NOTE(review): when the train file is missing or empty this test
        # passes without checking anything — confirm that is intended.
        if not train_path.exists() or train_path.stat().st_size <= 0:
            return
        with train_path.open() as f:
            first_record = json.loads(f.readline())
        assert "text" in first_record
        assert "<|im_start|>" in first_record["text"]

    def test_reflection_pairs_from_multi_attempt(self, tmp_path):
        """Multi-attempt instances should generate reflection pairs."""
        failed_first = make_trajectory_entry(
            resolved=False, attempt=1, category="assertion_error"
        )
        fixed_second = make_trajectory_entry(
            resolved=True, attempt=2, category="success"
        )
        self._populate_trajectories(tmp_path, [failed_first, fixed_second])
        stats = self._make_builder(tmp_path).build(include_reflection_pairs=True)
        # may be 0 if problem too short
        # NOTE(review): `>= 0` holds for any non-negative count, so this mostly
        # verifies that build() completes — consider tightening.
        assert stats.augmented_pairs >= 0

    def test_stats_category_counts(self, tmp_path):
        """Category counts include at least one ``assertion_error`` entry."""
        categories = ("assertion_error", "assertion_error", "syntax_error")
        self._populate_trajectories(
            tmp_path, [make_trajectory_entry(category=name) for name in categories]
        )
        stats = self._make_builder(tmp_path).build(include_reflection_pairs=False)
        assert stats.category_counts.get("assertion_error", 0) >= 1
# ── Evaluation report ─────────────────────────────────────────────────────────
class TestEvaluationReport:
    """Tests for the aggregates exposed by ``fine_tuning.evaluator.EvaluationReport``."""

    def _make_report(self, n_resolved, n_total, variant="test_model"):
        """Build a report of ``n_total`` results, the first ``n_resolved`` resolved.

        Resolved results use 1 attempt and category ``success``; the rest use
        3 attempts and category ``assertion_error``.
        """
        from fine_tuning.evaluator import EvaluationReport, EvalResult

        results = [
            EvalResult(
                instance_id=f"inst-{idx}",
                repo="django/django",
                resolved=(idx < n_resolved),
                attempts=1 if idx < n_resolved else 3,
                elapsed_seconds=10.0,
                token_cost=1500,
                patch="--- a/f.py\n+++ b/f.py\n",
                failure_category="success" if idx < n_resolved else "assertion_error",
                model_variant=variant,
            )
            for idx in range(n_total)
        ]
        return EvaluationReport(variant=variant, results=results)

    def test_pct_resolved(self):
        """30 of 100 resolved gives a resolved fraction of 0.30."""
        resolved_fraction = self._make_report(30, 100).pct_resolved
        assert abs(resolved_fraction - 0.30) < 1e-6

    def test_avg_attempts(self):
        """Average attempts mixes the 1-attempt and 3-attempt halves evenly."""
        mean_attempts = self._make_report(50, 100).avg_attempts
        # 50 resolved at 1 attempt + 50 unresolved at 3 attempts = (50+150)/100 = 2.0
        assert abs(mean_attempts - 2.0) < 1e-6

    def test_save_and_load(self, tmp_path):
        """``save()`` writes JSON whose summary carries the total and resolved counts."""
        target = tmp_path / "report.json"
        self._make_report(10, 50).save(target)
        assert target.exists()
        summary = json.loads(target.read_text())["summary"]
        assert summary["n_total"] == 50
        assert summary["n_resolved"] == 10

    def test_failure_breakdown(self):
        """The breakdown has an entry for both categories present in the results."""
        by_category = self._make_report(10, 20).failure_breakdown
        for expected in ("success", "assertion_error"):
            assert expected in by_category

    def test_to_ablation_row(self):
        """``to_ablation_row`` returns an ``AblationRow`` carrying the report's figures."""
        from fine_tuning.evaluator import AblationRow

        ablation = self._make_report(35, 100, "DeepSeek fine-tuned").to_ablation_row(
            recall_at_5=0.74
        )
        assert isinstance(ablation, AblationRow)
        assert abs(ablation.pct_resolved - 0.35) < 1e-6
        assert ablation.recall_at_5 == 0.74
# ── Ablation Table ────────────────────────────────────────────────────────────
class TestAblationTableBuilder:
    """Tests for ``AblationTableBuilder`` and ``AblationRow`` in ``fine_tuning.evaluator``."""

    def test_includes_published_baselines(self):
        """A fresh builder already holds at least two rows."""
        from fine_tuning.evaluator import AblationTableBuilder

        builder = AblationTableBuilder()
        assert len(builder._rows) >= 2  # Devin + SWE-agent

    def test_to_markdown_format(self):
        """The markdown table has the expected header cells and mentions Devin."""
        # Only the builder is needed here; the EvaluationReport / EvalResult
        # imports this test used to carry were never referenced.
        from fine_tuning.evaluator import AblationTableBuilder

        builder = AblationTableBuilder()
        md = builder.to_markdown()
        assert "| System Variant" in md
        assert "| Resolved" in md
        assert "Devin" in md

    def test_add_report(self):
        """Adding one report grows the row list by exactly one."""
        from fine_tuning.evaluator import AblationTableBuilder, EvaluationReport, EvalResult

        builder = AblationTableBuilder()
        initial_count = len(builder._rows)
        report = EvaluationReport(variant="test", results=[
            EvalResult("i1", "r", True, 1, 10.0, 1500, "p", "success", "test")
        ])
        builder.add_report(report, recall_at_5=0.74)
        assert len(builder._rows) == initial_count + 1

    def test_save_markdown(self, tmp_path):
        """``save_markdown`` writes a file containing the 'Ablation Results' heading text."""
        from fine_tuning.evaluator import AblationTableBuilder

        builder = AblationTableBuilder()
        path = tmp_path / "ablation.md"
        builder.save_markdown(path)
        assert path.exists()
        content = path.read_text()
        assert "Ablation Results" in content

    def test_markdown_row_format(self):
        """A row renders its fractions as one-decimal percentages plus the variant name."""
        from fine_tuning.evaluator import AblationRow

        row = AblationRow(
            system_variant="DeepSeek fine-tuned",
            pct_resolved=0.41,
            recall_at_5=0.74,
            avg_attempts=1.6,
            avg_token_cost=3200,
            n_instances=300,
        )
        md_row = row.to_markdown_row()
        assert "41.0%" in md_row
        assert "74.0%" in md_row
        assert "DeepSeek" in md_row
# ── Token count estimator ─────────────────────────────────────────────────────
class TestTokenCountEstimator:
    """Tests for ``fine_tuning.dataset_builder.estimate_token_counts``."""

    def test_estimate_on_jsonl(self, tmp_path):
        """Ten identical records are counted and yield a positive token estimate."""
        from fine_tuning.dataset_builder import estimate_token_counts

        jsonl_file = tmp_path / "data.jsonl"
        record_line = json.dumps({"text": "hello world " * 100, "metadata": {}}) + "\n"
        with jsonl_file.open("w") as handle:
            handle.writelines(record_line for _ in range(10))

        summary = estimate_token_counts(jsonl_file)
        assert summary["n_pairs"] == 10
        assert summary["estimated_tokens"] > 0
        assert "estimated_training_cost_usd" in summary

    def test_empty_file_returns_zeros(self, tmp_path):
        """An empty JSONL file reports zero pairs."""
        from fine_tuning.dataset_builder import estimate_token_counts

        empty_file = tmp_path / "empty.jsonl"
        empty_file.write_text("")
        assert estimate_token_counts(empty_file)["n_pairs"] == 0