Spaces:
Running
Running
| """B.4 — schema + content tests for benign_augmented_v2.json.""" | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| import pytest | |
# Location of the corpus under test. The path is relative, so it is resolved
# against the current working directory of the pytest run.
_PATH = Path("chakravyuh_env/benign_augmented_v2.json")


@pytest.fixture(scope="module")
def corpus() -> dict:
    """Load and parse the benign-augmented corpus once per test module.

    Registered as a fixture because every test in this module requests a
    ``corpus`` argument; without the decorator pytest cannot inject it.

    Returns:
        The decoded top-level JSON document.

    Raises:
        FileNotFoundError: if ``_PATH`` does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    return json.loads(_PATH.read_text(encoding="utf-8"))
| def test_corpus_top_level_keys(corpus: dict) -> None: | |
| for key in ("description", "version", "source", "split_intent", "templates"): | |
| assert key in corpus, f"missing top-level key: {key!r}" | |
| def test_corpus_intent_is_training_only(corpus: dict) -> None: | |
| assert "training" in corpus["split_intent"].lower(), ( | |
| "benign_augmented_v2 must be flagged training-only — " | |
| "it must not leak into the v0.2 test bench" | |
| ) | |
| def test_corpus_size(corpus: dict) -> None: | |
| n = len(corpus["templates"]) | |
| assert n >= 60, f"expected >= 60 templates, got {n}" | |
| def test_template_ids_are_unique(corpus: dict) -> None: | |
| ids = [t["id"] for t in corpus["templates"]] | |
| assert len(ids) == len(set(ids)), "duplicate template ids" | |
| def test_each_template_has_required_fields(corpus: dict) -> None: | |
| for t in corpus["templates"]: | |
| assert "id" in t and "category" in t and "text" in t, ( | |
| f"template missing required field: {t}" | |
| ) | |
| assert t["text"].strip(), f"template {t['id']} has empty text" | |
| def test_hard_negatives_have_explanation(corpus: dict) -> None: | |
| """If marked hard_negative=True, must include 'why_hard' justification.""" | |
| for t in corpus["templates"]: | |
| if t.get("hard_negative"): | |
| assert "why_hard" in t and t["why_hard"].strip(), ( | |
| f"hard negative {t['id']} missing 'why_hard' justification" | |
| ) | |
| def test_no_overlap_with_test_bench(corpus: dict) -> None: | |
| """No new template's text exactly matches a bench scenario (soft anti-leakage). | |
| Cheap exact-string match — a real soft-leakage filter (min-hash + Jaccard) | |
| is in training/grpo_analyzer.py:_filter_soft_leakage and runs at training | |
| time. This test is a fast first-line sanity check. | |
| """ | |
| bench = Path("data/chakravyuh-bench-v0/scenarios.jsonl") | |
| if not bench.exists(): | |
| pytest.skip("bench file missing") | |
| bench_texts: set[str] = set() | |
| with bench.open() as f: | |
| for line in f: | |
| row = json.loads(line) | |
| for turn in row.get("attack_sequence", []): | |
| bench_texts.add(turn.get("text", "").strip()) | |
| for t in corpus["templates"]: | |
| assert t["text"].strip() not in bench_texts, ( | |
| f"template {t['id']} duplicates a bench scenario verbatim" | |
| ) | |