File size: 3,115 Bytes
03815d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""Render leakage-clean slice chart from `logs/leakage_clean_slice.json`.

Shows the detection rate on the full bench (n=175) vs. the leakage-clean slice
(n=50; max cosine similarity to the nearest training text < 0.7) for each
provider with per-row scores. The point: detection holds up on novel text.

Output: plots/chakravyuh_plots/leakage_clean_slice.png
"""

from __future__ import annotations

import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np


# Display names for the x-axis tick labels, keyed by the `provider` value of
# each row in the JSON log. Providers missing from this table are shown under
# their raw id (the lookup in `main` falls back to the key itself).
_PROVIDER_LABELS: dict[str, str] = {
    "scripted": "scripted rules",
    "hf-deepseek-r1": "DeepSeek-R1",
    "hf-deepseek-v3-0324": "DeepSeek-V3",
    "hf-gemma-3-27b-it": "Gemma-3-27B",
    "hf-gpt-oss-120b": "GPT-OSS-120B",
    "hf-llama-3.3-70b-instruct": "Llama-3.3-70B",
    "hf-qwen2.5-72b-instruct": "Qwen2.5-72B",
    "hf-qwen2.5-7b-instruct": "Qwen2.5-7B",
}


def main() -> int:
    """Render the full-bench vs. leakage-clean detection bar chart.

    Reads ``logs/leakage_clean_slice.json`` (relative to the current working
    directory), draws one pair of bars per provider row -- ordered by
    leakage-clean detection, highest first -- annotates each pair with the
    delta (in percentage points) stored in the log, and saves the figure to
    ``plots/chakravyuh_plots/leakage_clean_slice.png``, creating the output
    directory if needed.

    Returns:
        0 on success; 1 when the input file is missing or contains no rows
        (an explanatory message is printed to stderr).

    Raises:
        json.JSONDecodeError: if the input file is not valid JSON.
        KeyError: if a row lacks one of the expected fields.
    """
    import sys  # local import: only the error paths below need it

    src = Path("logs/leakage_clean_slice.json")
    out = Path("plots/chakravyuh_plots/leakage_clean_slice.png")

    try:
        raw = src.read_text(encoding="utf-8")
    except FileNotFoundError:
        print(f"error: input file not found: {src}", file=sys.stderr)
        return 1

    data = json.loads(raw)
    rows = data["rows"]
    if not rows:
        # The legend takes its sample sizes from the first row, so an empty
        # list would otherwise fail with an IndexError further down.
        print(f"error: no rows in {src}; nothing to plot", file=sys.stderr)
        return 1

    # Sample sizes for the legend come from the first row of the log as
    # stored. NOTE(review): this assumes every provider was scored on the
    # same slices -- confirm against the script that writes the log.
    first = rows[0]
    n_full = first["full_bench"]["n_scam"] + first["full_bench"]["n_benign"]
    n_clean = first["leakage_clean"]["n_scam"] + first["leakage_clean"]["n_benign"]

    rows_sorted = sorted(rows, key=lambda r: r["leakage_clean"]["detection"], reverse=True)
    labels = [_PROVIDER_LABELS.get(r["provider"], r["provider"]) for r in rows_sorted]
    full = [r["full_bench"]["detection"] for r in rows_sorted]
    clean = [r["leakage_clean"]["detection"] for r in rows_sorted]
    deltas = [r["leakage_clean_delta"]["detection_pp"] for r in rows_sorted]

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        x = np.arange(len(labels))
        w = 0.38  # bar width; each pair is centred on its tick
        ax.bar(x - w / 2, full, w, label=f"full bench (n={n_full})",
               color="#1565c0", edgecolor="black", linewidth=0.5)
        ax.bar(x + w / 2, clean, w, label=f"leakage-clean (n={n_clean})",
               color="#558b2f", edgecolor="black", linewidth=0.5)

        for i, (f, c, d) in enumerate(zip(full, clean, deltas)):
            # Value label just above each bar.
            ax.text(i - w / 2, f + 0.015, f"{f:.2f}", ha="center", fontsize=8)
            ax.text(i + w / 2, c + 0.015, f"{c:.2f}", ha="center", fontsize=8)
            # Delta label above the taller bar; drops of more than 1pp are
            # drawn in red.
            sign = "+" if d >= 0 else ""
            ax.text(i, max(f, c) + 0.08, f"Δ {sign}{d:.1f}pp", ha="center",
                    fontsize=8, color="#c62828" if d < -1 else "#37474f")

        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=30, ha="right", fontsize=9)
        ax.set_ylabel("Detection rate", fontsize=11)
        # Headroom above 1.0 leaves space for the value and delta labels.
        ax.set_ylim(0, 1.15)
        ax.set_title(
            f"Leakage-clean slice — cosine gate {data['cosine_gate']} to nearest training text\n"
            "If detection drops on the clean slice, the model leaned on memorisation",
            fontsize=11, fontweight="bold",
        )
        ax.legend(loc="upper right", fontsize=9, framealpha=0.95)
        ax.grid(True, alpha=0.3, axis="y")
        fig.tight_layout()
        out.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out, dpi=120, bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

    print(f"Wrote {out} ({out.stat().st_size:,} bytes)")
    return 0


# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())