Spaces:

SouravNath
/

repomind-api

Running

File size: 16,966 Bytes

dc71cad

"""
tests/test_phase7_finetuning.py
────────────────────────────────
Unit tests for Phase 7: dataset builder, QLoRA config, and evaluator.
All tests run without GPU, model download, or real trajectory files.

Run with: pytest tests/test_phase7_finetuning.py -v
"""
from __future__ import annotations

import json
import tempfile
from dataclasses import asdict
from pathlib import Path

import pytest


# ── Helpers ───────────────────────────────────────────────────────────────────

def make_trajectory_entry(
    resolved: bool = True,
    category: str = "assertion_error",
    patch: str = "--- a/foo.py\n+++ b/foo.py\n@@ -1 +1 @@\n-bug\n+fix\n",
    problem: str = "Fix the null pointer error in the queryset filter method call",
    attempt: int = 1,
    instance_id: str = "django__django-123",
) -> dict:
    """Return one synthetic trajectory record as a plain dict.

    The six arguments are copied into the ``resolved``, ``failure_category``,
    ``patch``, ``problem_statement``, ``attempt`` and ``instance_id`` keys;
    ``resolved`` is additionally mirrored into the single
    ``fail_to_pass_results`` entry. Every other field is a fixed placeholder.
    A fresh dict (with fresh nested containers) is built on every call.
    """
    # Keyword form keeps the same insertion order as the serialized records.
    return dict(
        instance_id=instance_id,
        repo="django/django",
        attempt=attempt,
        patch=patch,
        test_stdout="AssertionError: expected True got False",
        fail_to_pass_results={"tests::test_x": resolved},
        pass_to_pass_results={},
        resolved=resolved,
        failure_category=category,
        elapsed_seconds=5.2,
        token_cost={"total_tokens": 1500},
        localised_files=["django/db/models/query.py"],
        problem_statement=problem,
        timestamp="2025-05-01T00:00:00+00:00",
    )


def write_trajectory_jsonl(tmp_path: Path, entries: list[dict]) -> Path:
    """Serialize ``entries`` to ``<tmp_path>/trajectories/test.jsonl``.

    The ``trajectories`` directory is created if missing, the file is
    overwritten, and each entry becomes one JSON document terminated by a
    newline. Returns the path of the written file.
    """
    target = tmp_path / "trajectories" / "test.jsonl"
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        handle.writelines(json.dumps(entry) + "\n" for entry in entries)
    return target


# ── QLoRA Config ──────────────────────────────────────────────────────────────

class TestQLoRAConfig:
    """Checks on the config objects exposed by ``fine_tuning.qlora_config``."""

    def test_default_config(self):
        """A bare TrainingConfig carries the expected model name and LoRA rank/alpha."""
        from fine_tuning.qlora_config import TrainingConfig
        training = TrainingConfig()
        assert training.model_name == "deepseek-ai/deepseek-coder-7b-instruct-v1.5"
        assert training.lora.r == 16
        assert training.lora.lora_alpha == 32

    def test_lora_scaling(self):
        """``scaling`` for alpha=32, r=16 is 2.0."""
        from fine_tuning.qlora_config import LoRAConfig
        adapter = LoRAConfig(r=16, lora_alpha=32)
        assert adapter.scaling == 2.0  # alpha / r

    def test_effective_batch_size(self):
        """Per-device batch 4 with 4 accumulation steps reports 16."""
        from fine_tuning.qlora_config import TrainingConfig
        training = TrainingConfig(
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
        )
        assert training.effective_batch_size == 16

    def test_lora_targets_include_mlp(self):
        """Default LoRA targets list the three MLP projection modules."""
        from fine_tuning.qlora_config import LoRAConfig
        targets = LoRAConfig().target_modules
        assert "gate_proj" in targets
        assert "up_proj" in targets
        assert "down_proj" in targets

    def test_bnb_config_defaults(self):
        """Default quantization config: 4-bit, nf4, double quantization on."""
        from fine_tuning.qlora_config import BitsAndBytesConfig
        quant = BitsAndBytesConfig()
        assert quant.load_in_4bit is True
        assert quant.bnb_4bit_quant_type == "nf4"
        assert quant.bnb_4bit_use_double_quant is True

    def test_vram_estimate_positive(self):
        """The VRAM estimate for the default config exceeds 4 GB."""
        from fine_tuning.qlora_config import TrainingConfig
        estimate = TrainingConfig().estimate_vram_gb()
        assert estimate > 4.0  # lower bound used as a sanity floor

    def test_get_config_variants(self):
        """Every listed variant name yields a config with a model name set."""
        from fine_tuning.qlora_config import get_config
        known_variants = ["default", "small_r", "large_r", "no_mlp", "longer", "qwen"]
        for name in known_variants:
            resolved_cfg = get_config(name)
            assert resolved_cfg.model_name is not None

    def test_get_config_invalid_raises(self):
        """An unrecognised variant name raises ValueError mentioning 'Unknown variant'."""
        from fine_tuning.qlora_config import get_config
        with pytest.raises(ValueError, match="Unknown variant"):
            get_config("nonexistent_variant")

    def test_small_r_has_lower_r(self):
        """The ``small_r`` variant uses a strictly smaller rank than ``default``."""
        from fine_tuning.qlora_config import get_config
        baseline = get_config("default")
        reduced = get_config("small_r")
        assert reduced.lora.r < baseline.lora.r

    def test_output_path_is_path(self):
        """``output_path`` is exposed as a ``pathlib.Path``."""
        from fine_tuning.qlora_config import TrainingConfig
        training = TrainingConfig()
        assert isinstance(training.output_path, Path)


# ── Training Pair formatting ──────────────────────────────────────────────────

class TestTrainingPair:
    """Checks on the export formats offered by ``TrainingPair``."""

    def _make_pair(self):
        """Build the fixed system/user/assistant triple shared by every test here."""
        from fine_tuning.dataset_builder import TrainingPair
        fields = {
            "system": "You are an engineer.",
            "user": "Fix the bug:\n## Issue\nDescription",
            "assistant": "--- a/foo.py\n+++ b/foo.py\n",
            "metadata": {"instance_id": "test-1"},
        }
        return TrainingPair(**fields)

    def test_to_chatml_format(self):
        """ChatML output carries a start marker per role and an end marker."""
        rendered = self._make_pair().to_chatml()
        assert "<|im_start|>system" in rendered
        assert "<|im_start|>user" in rendered
        assert "<|im_start|>assistant" in rendered
        assert "<|im_end|>" in rendered

    def test_to_alpaca_format(self):
        """Alpaca output has instruction/output keys; output is the assistant text."""
        record = self._make_pair().to_alpaca()
        assert "instruction" in record
        assert "output" in record
        assert record["output"] == "--- a/foo.py\n+++ b/foo.py\n"

    def test_to_sharegpt_format(self):
        """ShareGPT output lists turns in system, human, gpt order."""
        record = self._make_pair().to_sharegpt()
        assert "conversations" in record
        speakers = [turn["from"] for turn in record["conversations"]]
        assert speakers == ["system", "human", "gpt"]

    def test_to_openai_format(self):
        """OpenAI output lists messages in system, user, assistant order."""
        record = self._make_pair().to_openai()
        assert "messages" in record
        speakers = [message["role"] for message in record["messages"]]
        assert speakers == ["system", "user", "assistant"]

    def test_chatml_contains_content(self):
        """ChatML output embeds the text of all three fields."""
        rendered = self._make_pair().to_chatml()
        assert "You are an engineer" in rendered
        assert "Fix the bug" in rendered
        assert "--- a/foo.py" in rendered


# ── Dataset Builder ───────────────────────────────────────────────────────────

class TestFinetuningDatasetBuilder:
    """Run ``FinetuningDatasetBuilder`` against synthetic JSONL trajectories.

    Each test writes entries under ``tmp_path/trajectories`` and points the
    builder's output at ``tmp_path/output``.
    """

    def _make_builder(self, tmp_path):
        """Return a builder wired to ``tmp_path`` with a 20% validation split."""
        from fine_tuning.dataset_builder import FinetuningDatasetBuilder
        return FinetuningDatasetBuilder(
            trajectory_dir=tmp_path / "trajectories",
            output_dir=tmp_path / "output",
            val_fraction=0.2,
            min_problem_words=5,  # relaxed for testing
        )

    def _populate_trajectories(self, tmp_path, entries: list[dict]) -> Path:
        """Write ``entries`` where ``_make_builder`` expects them; return the file path."""
        return write_trajectory_jsonl(tmp_path, entries)

    def test_empty_trajectory_dir(self, tmp_path):
        """A trajectory directory that does not exist yields zero counts."""
        from fine_tuning.dataset_builder import FinetuningDatasetBuilder
        builder = FinetuningDatasetBuilder(
            trajectory_dir=tmp_path / "nonexistent",
            output_dir=tmp_path / "out",
        )
        stats = builder.build()
        assert stats.total_trajectories == 0
        assert stats.train_size == 0

    def test_builds_from_valid_trajectories(self, tmp_path):
        """Ten resolved entries are all counted and at least one pair is kept."""
        entries = [make_trajectory_entry(resolved=True) for _ in range(10)]
        self._populate_trajectories(tmp_path, entries)

        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)

        assert stats.total_trajectories == 10
        assert stats.train_size + stats.val_size > 0

    def test_filters_unknown_category(self, tmp_path):
        """An entry whose category is "unknown" is recorded under ``unknown_category``."""
        entries = [
            make_trajectory_entry(category="assertion_error"),
            make_trajectory_entry(category="unknown"),   # should be filtered
        ]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        assert stats.filter_reasons.get("unknown_category", 0) >= 1

    def test_filters_empty_patch(self, tmp_path):
        """An entry with an empty patch is recorded under ``empty_patch``."""
        entries = [make_trajectory_entry(patch="")]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        assert stats.filter_reasons.get("empty_patch", 0) >= 1

    def test_filters_invalid_patch_format(self, tmp_path):
        """A patch that is plain text is recorded under ``invalid_patch_format``."""
        entries = [make_trajectory_entry(patch="just some text")]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        assert stats.filter_reasons.get("invalid_patch_format", 0) >= 1

    def test_train_val_split(self, tmp_path):
        """With 20 entries the validation share lands strictly between 5% and 50%."""
        entries = [make_trajectory_entry() for _ in range(20)]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        # val should be ~20% of (train + val)
        total = stats.train_size + stats.val_size
        assert total > 0
        val_ratio = stats.val_size / total
        assert 0.05 < val_ratio < 0.50  # flexible for small datasets

    def test_output_files_created(self, tmp_path):
        """A build writes train.jsonl, val.jsonl and dataset_stats.json."""
        entries = [make_trajectory_entry() for _ in range(5)]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        builder.build(include_reflection_pairs=False)
        assert (tmp_path / "output" / "train.jsonl").exists()
        assert (tmp_path / "output" / "val.jsonl").exists()
        assert (tmp_path / "output" / "dataset_stats.json").exists()

    def test_chatml_format_output(self, tmp_path):
        """The first train row of a ChatML build has a ``text`` field with ChatML markers."""
        entries = [make_trajectory_entry() for _ in range(5)]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        builder.build(format="chatml", include_reflection_pairs=False)

        train_path = tmp_path / "output" / "train.jsonl"
        # Report the "nothing to inspect" case as a skip rather than letting
        # the test pass without having checked any row.
        if not train_path.exists() or train_path.stat().st_size == 0:
            pytest.skip("builder wrote no training rows; ChatML output not verified")

        with train_path.open() as f:
            first = json.loads(f.readline())
        assert "text" in first
        assert "<|im_start|>" in first["text"]

    def test_reflection_pairs_from_multi_attempt(self, tmp_path):
        """Multi-attempt instances should generate reflection pairs."""
        entries = [
            make_trajectory_entry(resolved=False, attempt=1, category="assertion_error"),
            make_trajectory_entry(resolved=True,  attempt=2, category="success"),
        ]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=True)
        # Smoke check only: confirms the build completes and exposes the counter.
        assert stats.augmented_pairs >= 0  # may be 0 if problem too short

    def test_stats_category_counts(self, tmp_path):
        """``category_counts`` reports at least one ``assertion_error`` entry."""
        entries = [
            make_trajectory_entry(category="assertion_error"),
            make_trajectory_entry(category="assertion_error"),
            make_trajectory_entry(category="syntax_error"),
        ]
        self._populate_trajectories(tmp_path, entries)
        builder = self._make_builder(tmp_path)
        stats = builder.build(include_reflection_pairs=False)
        assert stats.category_counts.get("assertion_error", 0) >= 1


# ── Evaluation report ─────────────────────────────────────────────────────────

class TestEvaluationReport:
    """Checks on the aggregate properties and export helpers of ``EvaluationReport``."""

    def _make_report(self, n_resolved, n_total, variant="test_model"):
        """Build a report of ``n_total`` results whose first ``n_resolved`` are resolved.

        Resolved results use 1 attempt and category "success"; the remainder
        use 3 attempts and category "assertion_error".
        """
        from fine_tuning.evaluator import EvaluationReport, EvalResult

        def build_result(index):
            solved = index < n_resolved
            return EvalResult(
                instance_id=f"inst-{index}",
                repo="django/django",
                resolved=solved,
                attempts=1 if solved else 3,
                elapsed_seconds=10.0,
                token_cost=1500,
                patch="--- a/f.py\n+++ b/f.py\n",
                failure_category="success" if solved else "assertion_error",
                model_variant=variant,
            )

        outcomes = [build_result(index) for index in range(n_total)]
        return EvaluationReport(variant=variant, results=outcomes)

    def test_pct_resolved(self):
        """30 resolved out of 100 gives a resolved fraction of 0.30."""
        summary = self._make_report(30, 100)
        assert abs(summary.pct_resolved - 0.30) < 1e-6

    def test_avg_attempts(self):
        """Half at 1 attempt and half at 3 attempts averages to 2.0."""
        summary = self._make_report(50, 100)
        # (50 * 1 + 50 * 3) / 100 == 2.0
        assert abs(summary.avg_attempts - 2.0) < 1e-6

    def test_save_and_load(self, tmp_path):
        """``save`` writes JSON whose summary holds the total and resolved counts."""
        summary = self._make_report(10, 50)
        destination = tmp_path / "report.json"
        summary.save(destination)
        assert destination.exists()
        payload = json.loads(destination.read_text())
        assert payload["summary"]["n_total"] == 50
        assert payload["summary"]["n_resolved"] == 10

    def test_failure_breakdown(self):
        """The breakdown has keys for both categories present in the results."""
        categories = self._make_report(10, 20).failure_breakdown
        assert "success" in categories
        assert "assertion_error" in categories

    def test_to_ablation_row(self):
        """``to_ablation_row`` returns an AblationRow carrying the resolved rate and recall."""
        from fine_tuning.evaluator import AblationRow
        summary = self._make_report(35, 100, "DeepSeek fine-tuned")
        table_row = summary.to_ablation_row(recall_at_5=0.74)
        assert isinstance(table_row, AblationRow)
        assert abs(table_row.pct_resolved - 0.35) < 1e-6
        assert table_row.recall_at_5 == 0.74


# ── Ablation Table ────────────────────────────────────────────────────────────

class TestAblationTableBuilder:
    """Checks on ``AblationTableBuilder`` row handling and Markdown rendering."""

    def test_includes_published_baselines(self):
        """A fresh builder already holds at least two rows."""
        from fine_tuning.evaluator import AblationTableBuilder
        builder = AblationTableBuilder()
        assert len(builder._rows) >= 2  # Devin + SWE-agent

    def test_to_markdown_format(self):
        """The rendered table has the expected column headers and a Devin row."""
        # Only the builder is needed here; the report/result types are unused.
        from fine_tuning.evaluator import AblationTableBuilder
        builder = AblationTableBuilder()
        md = builder.to_markdown()
        assert "| System Variant" in md
        assert "| Resolved" in md
        assert "Devin" in md

    def test_add_report(self):
        """``add_report`` appends exactly one row to the builder."""
        from fine_tuning.evaluator import AblationTableBuilder, EvaluationReport, EvalResult
        builder = AblationTableBuilder()
        initial_count = len(builder._rows)

        report = EvaluationReport(variant="test", results=[
            EvalResult("i1", "r", True, 1, 10.0, 1500, "p", "success", "test")
        ])
        builder.add_report(report, recall_at_5=0.74)
        assert len(builder._rows) == initial_count + 1

    def test_save_markdown(self, tmp_path):
        """``save_markdown`` writes a file containing the "Ablation Results" heading text."""
        from fine_tuning.evaluator import AblationTableBuilder
        builder = AblationTableBuilder()
        path = tmp_path / "ablation.md"
        builder.save_markdown(path)
        assert path.exists()
        content = path.read_text()
        assert "Ablation Results" in content

    def test_markdown_row_format(self):
        """A row renders its fractions as one-decimal percentages plus the variant name."""
        from fine_tuning.evaluator import AblationRow
        row = AblationRow(
            system_variant="DeepSeek fine-tuned",
            pct_resolved=0.41,
            recall_at_5=0.74,
            avg_attempts=1.6,
            avg_token_cost=3200,
            n_instances=300,
        )
        md_row = row.to_markdown_row()
        assert "41.0%" in md_row
        assert "74.0%" in md_row
        assert "DeepSeek" in md_row


# ── Token count estimator ─────────────────────────────────────────────────────

class TestTokenCountEstimator:
    """Checks on ``estimate_token_counts`` for populated and empty JSONL files."""

    def test_estimate_on_jsonl(self, tmp_path):
        """Ten rows give ``n_pairs`` of 10, a positive token estimate and a cost key."""
        from fine_tuning.dataset_builder import estimate_token_counts
        jsonl_file = tmp_path / "data.jsonl"
        rows = [{"text": "hello world " * 100, "metadata": {}} for _ in range(10)]
        with jsonl_file.open("w") as handle:
            handle.writelines(json.dumps(row) + "\n" for row in rows)

        summary = estimate_token_counts(jsonl_file)
        assert summary["n_pairs"] == 10
        assert summary["estimated_tokens"] > 0
        assert "estimated_training_cost_usd" in summary

    def test_empty_file_returns_zeros(self, tmp_path):
        """A zero-byte file gives ``n_pairs`` of 0."""
        from fine_tuning.dataset_builder import estimate_token_counts
        jsonl_file = tmp_path / "empty.jsonl"
        jsonl_file.write_text("")
        summary = estimate_token_counts(jsonl_file)
        assert summary["n_pairs"] == 0