Spaces:
Running
Running
| """ | |
| tests/test_phase7_finetuning.py | |
| ─────────────────────────────── | |
| Unit tests for Phase 7: dataset builder, QLoRA config, and evaluator. | |
| All tests run without GPU, model download, or real trajectory files. | |
| Run with: pytest tests/test_phase7_finetuning.py -v | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import tempfile | |
| from dataclasses import asdict | |
| from pathlib import Path | |
| import pytest | |
| # ── Helpers ─────────────────────────────────────────────────────────────────── | |
def make_trajectory_entry(
    resolved: bool = True,
    category: str = "assertion_error",
    patch: str = "--- a/foo.py\n+++ b/foo.py\n@@ -1 +1 @@\n-bug\n+fix\n",
    problem: str = "Fix the null pointer error in the queryset filter method call",
    attempt: int = 1,
    instance_id: str = "django__django-123",
) -> dict:
    """Return one synthetic trajectory record as a plain dict.

    Only the fields exposed as parameters vary; every other field is a fixed
    placeholder value.

    Args:
        resolved: Stored under ``"resolved"`` and as the outcome of the single
            ``fail_to_pass_results`` entry.
        category: Stored under ``"failure_category"``.
        patch: Stored under ``"patch"``; defaults to a tiny unified diff.
        problem: Stored under ``"problem_statement"``.
        attempt: Stored under ``"attempt"``.
        instance_id: Stored under ``"instance_id"``.

    Returns:
        A new dict on every call, with keys in a fixed insertion order.
    """
    # dict(...) keeps keyword order, so the serialized key order is stable.
    entry = dict(
        instance_id=instance_id,
        repo="django/django",
        attempt=attempt,
        patch=patch,
        test_stdout="AssertionError: expected True got False",
        fail_to_pass_results={"tests::test_x": resolved},
        pass_to_pass_results={},
        resolved=resolved,
        failure_category=category,
        elapsed_seconds=5.2,
        token_cost={"total_tokens": 1500},
        localised_files=["django/db/models/query.py"],
        problem_statement=problem,
        timestamp="2025-05-01T00:00:00+00:00",
    )
    return entry
def write_trajectory_jsonl(tmp_path: Path, entries: list[dict]) -> Path:
    """Serialize ``entries`` as JSON Lines under ``tmp_path``.

    The file is always ``<tmp_path>/trajectories/test.jsonl``; the parent
    directory is created when missing and an existing file is overwritten.

    Args:
        tmp_path: Base directory (typically pytest's ``tmp_path`` fixture).
        entries: JSON-serializable dicts, written one per line.

    Returns:
        Path of the written JSONL file.
    """
    jsonl_path = tmp_path / "trajectories" / "test.jsonl"
    jsonl_path.parent.mkdir(parents=True, exist_ok=True)
    with jsonl_path.open("w") as handle:
        handle.writelines(json.dumps(entry) + "\n" for entry in entries)
    return jsonl_path
| # ── QLoRA Config ────────────────────────────────────────────────────────────── | |
class TestQLoRAConfig:
    """Tests for the configuration objects in ``fine_tuning.qlora_config``.

    Imports are done inside each test so a missing module fails the
    individual test rather than collection of the whole file.
    """

    def test_default_config(self):
        """A default TrainingConfig carries the expected model name and LoRA rank/alpha."""
        from fine_tuning.qlora_config import TrainingConfig

        config = TrainingConfig()
        assert config.model_name == "deepseek-ai/deepseek-coder-7b-instruct-v1.5"
        assert config.lora.r == 16
        assert config.lora.lora_alpha == 32

    def test_lora_scaling(self):
        """``scaling`` equals 2.0 for alpha=32, r=16."""
        from fine_tuning.qlora_config import LoRAConfig

        assert LoRAConfig(r=16, lora_alpha=32).scaling == 2.0  # 32/16

    def test_effective_batch_size(self):
        """A per-device batch of 4 with 4 accumulation steps gives an effective batch of 16."""
        from fine_tuning.qlora_config import TrainingConfig

        overrides = {
            "per_device_train_batch_size": 4,
            "gradient_accumulation_steps": 4,
        }
        config = TrainingConfig(**overrides)
        assert config.effective_batch_size == 16

    def test_lora_targets_include_mlp(self):
        """The default LoRA target modules include the three MLP projections."""
        from fine_tuning.qlora_config import LoRAConfig

        lora = LoRAConfig()
        for module_name in ("gate_proj", "up_proj", "down_proj"):
            assert module_name in lora.target_modules

    def test_bnb_config_defaults(self):
        """Default quantization settings: 4-bit, nf4, double quantization on."""
        from fine_tuning.qlora_config import BitsAndBytesConfig

        quant = BitsAndBytesConfig()
        assert quant.load_in_4bit is True
        assert quant.bnb_4bit_quant_type == "nf4"
        assert quant.bnb_4bit_use_double_quant is True

    def test_vram_estimate_positive(self):
        """The VRAM estimate for the default config exceeds 4 GB."""
        from fine_tuning.qlora_config import TrainingConfig

        estimate = TrainingConfig().estimate_vram_gb()
        assert estimate > 4.0  # at least model size

    def test_get_config_variants(self):
        """Every known variant name resolves to a config with a model name."""
        from fine_tuning.qlora_config import get_config

        known_variants = ["default", "small_r", "large_r", "no_mlp", "longer", "qwen"]
        for name in known_variants:
            assert get_config(name).model_name is not None

    def test_get_config_invalid_raises(self):
        """An unrecognized variant name raises ValueError mentioning 'Unknown variant'."""
        from fine_tuning.qlora_config import get_config

        with pytest.raises(ValueError, match="Unknown variant"):
            get_config("nonexistent_variant")

    def test_small_r_has_lower_r(self):
        """The ``small_r`` variant uses a strictly smaller LoRA rank than ``default``."""
        from fine_tuning.qlora_config import get_config

        baseline = get_config("default")
        reduced = get_config("small_r")
        assert reduced.lora.r < baseline.lora.r

    def test_output_path_is_path(self):
        """``output_path`` is exposed as a ``pathlib.Path``."""
        from fine_tuning.qlora_config import TrainingConfig

        assert isinstance(TrainingConfig().output_path, Path)
| # ── Training Pair formatting ────────────────────────────────────────────────── | |
class TestTrainingPair:
    """Tests for the export formats of ``fine_tuning.dataset_builder.TrainingPair``."""

    def _make_pair(self):
        """Build a small TrainingPair with fixed system/user/assistant text."""
        from fine_tuning.dataset_builder import TrainingPair

        fields = {
            "system": "You are an engineer.",
            "user": "Fix the bug:\n## Issue\nDescription",
            "assistant": "--- a/foo.py\n+++ b/foo.py\n",
            "metadata": {"instance_id": "test-1"},
        }
        return TrainingPair(**fields)

    def test_to_chatml_format(self):
        """ChatML output contains a start marker per role and an end marker."""
        rendered = self._make_pair().to_chatml()
        expected_markers = (
            "<|im_start|>system",
            "<|im_start|>user",
            "<|im_start|>assistant",
            "<|im_end|>",
        )
        for marker in expected_markers:
            assert marker in rendered

    def test_to_alpaca_format(self):
        """Alpaca output has instruction/output keys; output is the assistant text."""
        record = self._make_pair().to_alpaca()
        assert "instruction" in record
        assert "output" in record
        assert record["output"] == "--- a/foo.py\n+++ b/foo.py\n"

    def test_to_sharegpt_format(self):
        """ShareGPT output lists turns from system, human, gpt in that order."""
        record = self._make_pair().to_sharegpt()
        assert "conversations" in record
        speakers = [turn["from"] for turn in record["conversations"]]
        assert speakers == ["system", "human", "gpt"]

    def test_to_openai_format(self):
        """OpenAI output lists messages with roles system, user, assistant in that order."""
        record = self._make_pair().to_openai()
        assert "messages" in record
        speakers = [message["role"] for message in record["messages"]]
        assert speakers == ["system", "user", "assistant"]

    def test_chatml_contains_content(self):
        """ChatML output carries text from all three messages."""
        rendered = self._make_pair().to_chatml()
        for snippet in ("You are an engineer", "Fix the bug", "--- a/foo.py"):
            assert snippet in rendered
| # ── Dataset Builder ─────────────────────────────────────────────────────────── | |
class TestFinetuningDatasetBuilder:
    """Tests for ``fine_tuning.dataset_builder.FinetuningDatasetBuilder``.

    Each test writes synthetic trajectory records under ``tmp_path`` with the
    module-level helpers, runs ``build()``, and inspects the returned stats or
    the files written to ``tmp_path / "output"``.
    """

    def _make_builder(self, tmp_path):
        """Return a builder reading ``tmp_path/trajectories`` and writing ``tmp_path/output``."""
        from fine_tuning.dataset_builder import FinetuningDatasetBuilder

        return FinetuningDatasetBuilder(
            trajectory_dir=tmp_path / "trajectories",
            output_dir=tmp_path / "output",
            val_fraction=0.2,
            min_problem_words=5,  # relaxed for testing
        )

    def _populate_trajectories(self, tmp_path, entries: list[dict]) -> Path:
        """Write ``entries`` as the trajectory JSONL file and return its path."""
        return write_trajectory_jsonl(tmp_path, entries)

    def test_empty_trajectory_dir(self, tmp_path):
        """A missing trajectory directory yields zero trajectories and an empty train set."""
        from fine_tuning.dataset_builder import FinetuningDatasetBuilder

        builder = FinetuningDatasetBuilder(
            trajectory_dir=tmp_path / "nonexistent",
            output_dir=tmp_path / "out",
        )
        stats = builder.build()
        assert stats.total_trajectories == 0
        assert stats.train_size == 0

    def test_builds_from_valid_trajectories(self, tmp_path):
        """Ten resolved trajectories are all counted and produce at least one pair."""
        entries = [make_trajectory_entry(resolved=True) for _ in range(10)]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        assert stats.total_trajectories == 10
        assert stats.train_size + stats.val_size > 0

    def test_filters_unknown_category(self, tmp_path):
        """A trajectory with category 'unknown' is recorded under that filter reason."""
        entries = [
            make_trajectory_entry(category="assertion_error"),
            make_trajectory_entry(category="unknown"),  # should be filtered
        ]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        assert stats.filter_reasons.get("unknown_category", 0) >= 1

    def test_filters_empty_patch(self, tmp_path):
        """A trajectory with an empty patch is recorded under 'empty_patch'."""
        entries = [make_trajectory_entry(patch="")]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        assert stats.filter_reasons.get("empty_patch", 0) >= 1

    def test_filters_invalid_patch_format(self, tmp_path):
        """A patch that is not diff-formatted is recorded under 'invalid_patch_format'."""
        entries = [make_trajectory_entry(patch="just some text")]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        assert stats.filter_reasons.get("invalid_patch_format", 0) >= 1

    def test_train_val_split(self, tmp_path):
        """With val_fraction=0.2 the validation share lands in a loose (5%, 50%) band."""
        entries = [make_trajectory_entry() for _ in range(20)]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        # val should be ~20% of (train + val)
        total = stats.train_size + stats.val_size
        assert total > 0
        val_ratio = stats.val_size / total
        assert 0.05 < val_ratio < 0.50  # flexible for small datasets

    def test_output_files_created(self, tmp_path):
        """``build()`` writes train.jsonl, val.jsonl and dataset_stats.json."""
        entries = [make_trajectory_entry() for _ in range(5)]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        builder.build(include_reflection_pairs=False)
        output_dir = tmp_path / "output"
        assert (output_dir / "train.jsonl").exists()
        assert (output_dir / "val.jsonl").exists()
        assert (output_dir / "dataset_stats.json").exists()

    def test_chatml_format_output(self, tmp_path):
        """The first train record in chatml format has a ``text`` field with ChatML markers.

        If the builder produced no training records the test is skipped rather
        than silently passing, so a run that checks nothing is visible in the
        pytest report.
        """
        entries = [make_trajectory_entry() for _ in range(5)]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        builder.build(format="chatml", include_reflection_pairs=False)
        train_path = tmp_path / "output" / "train.jsonl"
        if not train_path.exists() or train_path.stat().st_size == 0:
            pytest.skip("builder wrote no training records; nothing to inspect")
        with train_path.open() as f:
            first = json.loads(f.readline())
        assert "text" in first
        assert "<|im_start|>" in first["text"]

    def test_reflection_pairs_from_multi_attempt(self, tmp_path):
        """Multi-attempt instances should generate reflection pairs."""
        entries = [
            make_trajectory_entry(resolved=False, attempt=1, category="assertion_error"),
            make_trajectory_entry(resolved=True, attempt=2, category="success"),
        ]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=True)
        # Smoke check only: confirms build() completes with reflection pairs
        # enabled and that the counter is a non-negative number.
        assert stats.augmented_pairs >= 0  # may be 0 if problem too short

    def test_stats_category_counts(self, tmp_path):
        """Category counts in the stats include the 'assertion_error' entries."""
        entries = [
            make_trajectory_entry(category="assertion_error"),
            make_trajectory_entry(category="assertion_error"),
            make_trajectory_entry(category="syntax_error"),
        ]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        assert stats.category_counts.get("assertion_error", 0) >= 1
| # ── Evaluation report ───────────────────────────────────────────────────────── | |
class TestEvaluationReport:
    """Tests for the aggregate metrics of ``fine_tuning.evaluator.EvaluationReport``."""

    def _make_report(self, n_resolved, n_total, variant="test_model"):
        """Build a report of ``n_total`` results, the first ``n_resolved`` of them resolved.

        Resolved results use 1 attempt and category "success"; the rest use
        3 attempts and category "assertion_error".
        """
        from fine_tuning.evaluator import EvaluationReport, EvalResult

        results = [
            EvalResult(
                instance_id=f"inst-{idx}",
                repo="django/django",
                resolved=(idx < n_resolved),
                attempts=1 if idx < n_resolved else 3,
                elapsed_seconds=10.0,
                token_cost=1500,
                patch="--- a/f.py\n+++ b/f.py\n",
                failure_category="success" if idx < n_resolved else "assertion_error",
                model_variant=variant,
            )
            for idx in range(n_total)
        ]
        return EvaluationReport(variant=variant, results=results)

    def test_pct_resolved(self):
        """30 resolved out of 100 gives a resolved fraction of 0.30."""
        fraction = self._make_report(30, 100).pct_resolved
        assert abs(fraction - 0.30) < 1e-6

    def test_avg_attempts(self):
        """Average attempts mixes resolved (1 attempt) and unresolved (3 attempts)."""
        mean_attempts = self._make_report(50, 100).avg_attempts
        # 50 resolved at 1 attempt + 50 unresolved at 3 attempts = (50+150)/100 = 2.0
        assert abs(mean_attempts - 2.0) < 1e-6

    def test_save_and_load(self, tmp_path):
        """``save()`` writes JSON whose summary carries the total and resolved counts."""
        target = tmp_path / "report.json"
        self._make_report(10, 50).save(target)
        assert target.exists()
        summary = json.loads(target.read_text())["summary"]
        assert summary["n_total"] == 50
        assert summary["n_resolved"] == 10

    def test_failure_breakdown(self):
        """The failure breakdown has entries for both categories present in the results."""
        by_category = self._make_report(10, 20).failure_breakdown
        for category in ("success", "assertion_error"):
            assert category in by_category

    def test_to_ablation_row(self):
        """``to_ablation_row`` returns an AblationRow with the report's rate and the given recall."""
        from fine_tuning.evaluator import AblationRow

        report = self._make_report(35, 100, "DeepSeek fine-tuned")
        ablation_row = report.to_ablation_row(recall_at_5=0.74)
        assert isinstance(ablation_row, AblationRow)
        assert abs(ablation_row.pct_resolved - 0.35) < 1e-6
        assert ablation_row.recall_at_5 == 0.74
| # ── Ablation Table ──────────────────────────────────────────────────────────── | |
class TestAblationTableBuilder:
    """Tests for ``AblationTableBuilder`` and ``AblationRow`` in ``fine_tuning.evaluator``.

    Row counts are read from the builder's private ``_rows`` collection.
    """

    def test_includes_published_baselines(self):
        """A fresh builder already holds at least two rows."""
        from fine_tuning.evaluator import AblationTableBuilder

        builder = AblationTableBuilder()
        assert len(builder._rows) >= 2  # Devin + SWE-agent

    def test_to_markdown_format(self):
        """The markdown table has the expected column headers and a Devin row."""
        # Only the builder is needed here; the report/result types that used to
        # be imported alongside it were never referenced.
        from fine_tuning.evaluator import AblationTableBuilder

        builder = AblationTableBuilder()
        md = builder.to_markdown()
        assert "| System Variant" in md
        assert "| Resolved" in md
        assert "Devin" in md

    def test_add_report(self):
        """Adding one report grows the builder's row collection by exactly one."""
        from fine_tuning.evaluator import AblationTableBuilder, EvaluationReport, EvalResult

        builder = AblationTableBuilder()
        initial_count = len(builder._rows)
        report = EvaluationReport(variant="test", results=[
            EvalResult("i1", "r", True, 1, 10.0, 1500, "p", "success", "test")
        ])
        builder.add_report(report, recall_at_5=0.74)
        assert len(builder._rows) == initial_count + 1

    def test_save_markdown(self, tmp_path):
        """``save_markdown`` writes a file containing the 'Ablation Results' heading text."""
        from fine_tuning.evaluator import AblationTableBuilder

        builder = AblationTableBuilder()
        path = tmp_path / "ablation.md"
        builder.save_markdown(path)
        assert path.exists()
        content = path.read_text()
        assert "Ablation Results" in content

    def test_markdown_row_format(self):
        """A row renders its fractions as one-decimal percentages plus the variant name."""
        from fine_tuning.evaluator import AblationRow

        row = AblationRow(
            system_variant="DeepSeek fine-tuned",
            pct_resolved=0.41,
            recall_at_5=0.74,
            avg_attempts=1.6,
            avg_token_cost=3200,
            n_instances=300,
        )
        md_row = row.to_markdown_row()
        assert "41.0%" in md_row
        assert "74.0%" in md_row
        assert "DeepSeek" in md_row
| # ── Token count estimator ───────────────────────────────────────────────────── | |
class TestTokenCountEstimator:
    """Tests for ``fine_tuning.dataset_builder.estimate_token_counts``."""

    def test_estimate_on_jsonl(self, tmp_path):
        """Ten identical records are counted and yield a positive token estimate and a cost field."""
        from fine_tuning.dataset_builder import estimate_token_counts

        jsonl_path = tmp_path / "data.jsonl"
        # All ten records are identical, so serialize once and repeat the line.
        line = json.dumps({"text": "hello world " * 100, "metadata": {}}) + "\n"
        with jsonl_path.open("w") as handle:
            handle.writelines(line for _ in range(10))
        result = estimate_token_counts(jsonl_path)
        assert result["n_pairs"] == 10
        assert result["estimated_tokens"] > 0
        assert "estimated_training_cost_usd" in result

    def test_empty_file_returns_zeros(self, tmp_path):
        """An empty JSONL file reports zero pairs."""
        from fine_tuning.dataset_builder import estimate_token_counts

        empty_path = tmp_path / "empty.jsonl"
        empty_path.write_text("")
        assert estimate_token_counts(empty_path)["n_pairs"] == 0