"""Build the Qwen3.6 Carnice SFT benchmark card.

Reads the raw IFEval, BFCL and held-out validation results stored under
``raw/``, writes the collected numbers to ``data/metrics.json`` and renders
a two-panel benchmark card into ``figures/`` as PNG and SVG.
"""

from __future__ import annotations

import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import FancyBboxPatch

ROOT = Path(__file__).resolve().parents[1]
FIGURES = ROOT / "figures"
DATA = ROOT / "data"

# (panel label, key used in metrics.json, key used in the raw IFEval results)
_IFEVAL_METRICS = (
    ("Prompt strict", "prompt_strict", "prompt_level_strict_acc,none"),
    ("Prompt loose", "prompt_loose", "prompt_level_loose_acc,none"),
    ("Instruction strict", "instruction_strict", "inst_level_strict_acc,none"),
    ("Instruction loose", "instruction_loose", "inst_level_loose_acc,none"),
)

_RC_PARAMS = {
    "font.family": "DejaVu Sans",
    "figure.facecolor": "#ffffff",
    "axes.facecolor": "#ffffff",
    "savefig.facecolor": "#ffffff",
    "text.color": "#0f1115",
    "axes.labelcolor": "#0f1115",
    "xtick.color": "#0f1115",
    "ytick.color": "#4b5563",
    "axes.edgecolor": "#a6a6a6",
}


def load_json(path: Path) -> dict:
    """Parse the UTF-8 file at *path* as a single JSON document."""
    return json.loads(path.read_text(encoding="utf-8"))


def _load_score_summary(path: Path) -> dict:
    """Return the leading JSON record of a score file.

    The file is parsed as one JSON document when possible. If that fails
    (for example because further records follow the first line), only the
    first non-blank line is parsed.

    Raises:
        ValueError: if the file contains no non-blank line, or if the first
            non-blank line is not valid JSON (``json.JSONDecodeError``).
    """
    text = path.read_text(encoding="utf-8")
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        first_line = next((line for line in text.splitlines() if line.strip()), None)
        if first_line is None:
            raise ValueError(f"score file is empty: {path}") from None
        return json.loads(first_line)


def first_result(root: Path, pattern: str) -> Path:
    """Return the first path (in sorted order) under *root* matching *pattern*.

    Raises:
        FileNotFoundError: if the glob matches nothing.
    """
    matches = sorted(root.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"no file matching {pattern!r} under {root}")
    return matches[0]


def pct(value: float) -> str:
    """Format a fraction as a percentage with one decimal place."""
    return f"{value * 100:.1f}%"


def _hex_to_rgb(hex_color: str) -> np.ndarray:
    """Convert an ``rrggbb`` hex string (optional leading ``#``) to floats in [0, 1]."""
    digits = hex_color.lstrip("#")
    return np.array([int(digits[i : i + 2], 16) / 255 for i in (0, 2, 4)])


def add_gradient_bar(ax, patch, top_color: str, bottom_color: str) -> None:
    """Overlay a vertical colour gradient on a bar and hide the bar's own fill.

    A 256-step image running from *bottom_color* (bottom edge) to
    *top_color* (top edge) is drawn over the rectangle of *patch* and
    clipped to it; the patch face and edge are then made fully transparent.
    Bars whose height is zero or negative are left untouched.
    """
    x, y = patch.get_x(), patch.get_y()
    w, h = patch.get_width(), patch.get_height()
    if h <= 0:
        return

    gradient = np.linspace(_hex_to_rgb(bottom_color), _hex_to_rgb(top_color), 256)
    ax.imshow(
        gradient.reshape(256, 1, 3),
        extent=[x, x + w, y, y + h],
        origin="lower",
        aspect="auto",
        clip_path=patch,
        clip_on=True,
        # Slightly above the bar so the image is not hidden behind it.
        zorder=patch.get_zorder() + 0.1,
    )
    patch.set_facecolor((1, 1, 1, 0))
    patch.set_edgecolor((1, 1, 1, 0))


def _collect_metrics() -> dict:
    """Read the raw benchmark files and assemble the metrics dictionary."""
    remote = ROOT / "raw" / "remote_benchmarks"
    bfcl_scores = ROOT / "raw" / "bfcl_scores"

    base_ifeval_path = first_result(remote, "ifeval_base/*/results_*.json")
    adapter_ifeval_path = first_result(remote, "ifeval_adapter/*/results_*.json")
    base_ifeval = load_json(base_ifeval_path)["results"]["ifeval"]
    adapter_ifeval = load_json(adapter_ifeval_path)["results"]["ifeval"]

    validation_summary = load_json(
        remote / "qwen36_carnice_benchmark_summary_20260425.json"
    )["training_format_validation"]

    # Both score files go through the same loader so that a file with extra
    # records after the summary line is handled identically for both models.
    score_file = Path("multi_turn") / "BFCL_v4_multi_turn_base_score.json"
    bfcl_base = _load_score_summary(bfcl_scores / "qwen36-base-local-FC" / score_file)
    bfcl_adapter = _load_score_summary(
        bfcl_scores / "qwen36-carnice-v1-local-FC" / score_file
    )

    return {
        "run": "qwen36_short_public_ab_20260425_155339",
        "model": {
            "base": "Qwen/Qwen3.6-27B",
            "carnice_sft": "qwen36_carnice_direct_v1b_lora_8192_split_200step",
        },
        "note": "All plotted values are raw measured values from the included benchmark files.",
        "ifeval_limit_20": {
            "base": {short: base_ifeval[raw] for _, short, raw in _IFEVAL_METRICS},
            "carnice_sft": {
                short: adapter_ifeval[raw] for _, short, raw in _IFEVAL_METRICS
            },
        },
        "heldout_training_format_validation": validation_summary,
        "bfcl_multi_turn_base_limit_2": {
            "base": bfcl_base,
            "carnice_sft": {
                "accuracy": bfcl_adapter["accuracy"],
                "correct_count": bfcl_adapter["correct_count"],
                "total_count": bfcl_adapter["total_count"],
            },
        },
        "source_files": {
            # as_posix() keeps the separators consistent with the literal
            # forward-slash paths below on every platform.
            "ifeval_base": base_ifeval_path.relative_to(ROOT).as_posix(),
            "ifeval_carnice_sft": adapter_ifeval_path.relative_to(ROOT).as_posix(),
            "bfcl_scores": "raw/bfcl_scores/",
            "validation": "raw/remote_benchmarks/qwen36_carnice_benchmark_summary_20260425.json",
        },
    }


def _add_card_backgrounds(fig) -> None:
    """Draw the two rounded background cards behind the panels."""
    for x, y, w, h in [(0.018, 0.04, 0.66, 0.92), (0.705, 0.04, 0.275, 0.92)]:
        fig.add_artist(
            FancyBboxPatch(
                (x, y),
                w,
                h,
                boxstyle="round,pad=0.016,rounding_size=0.025",
                transform=fig.transFigure,
                linewidth=1.0,
                edgecolor="#e4e7ec",
                facecolor="#ffffff",
                zorder=-10,
            )
        )


def _draw_ifeval_panel(fig, base_vals: list, carnice_vals: list) -> None:
    """Draw the left panel: grouped Base / Carnice SFT bars per IFEval metric."""
    ax = fig.add_axes([0.060, 0.21, 0.60, 0.66])
    group_labels = [label for label, _, _ in _IFEVAL_METRICS]
    centers = np.arange(len(group_labels)) * 1.55
    width = 0.46

    base_bars = ax.bar(centers - width / 2, base_vals, width=width, color="#d8c6ef", zorder=3)
    carnice_bars = ax.bar(centers + width / 2, carnice_vals, width=width, color="#cbd9f7", zorder=3)
    for patch in base_bars:
        add_gradient_bar(ax, patch, "#d8c5ef", "#efe7fa")
    for patch in carnice_bars:
        add_gradient_bar(ax, patch, "#c8d7f6", "#eef4ff")

    # Set the limits after drawing the gradients: imshow may adjust the axes
    # limits to the image extent.
    ax.set_xlim(centers[0] - 0.70, centers[-1] + 0.70)
    # NOTE: the y-axis is fixed to 75%-100.5%; a score below 75% would fall
    # outside the visible range.
    ax.set_ylim(0.75, 1.005)
    yticks = np.arange(0.75, 1.01, 0.05)
    ax.set_yticks(yticks)
    ax.set_yticklabels([f"{int(round(v * 100))}%" for v in yticks], fontsize=10)
    ax.grid(axis="y", color="#dfe3ea", linewidth=1.0, linestyle=(0, (2.2, 2.2)), zorder=0)
    ax.spines[["top", "right", "left"]].set_visible(False)
    ax.spines["bottom"].set_color("#9ca3af")
    ax.tick_params(axis="y", length=0, pad=8)
    ax.tick_params(axis="x", length=0, pad=10)

    tick_positions = []
    tick_labels = []
    for center, label in zip(centers, group_labels):
        tick_positions.extend([center - width / 2, center + width / 2])
        tick_labels.extend(["Base", "Carnice SFT"])
        # Group caption placed below the axis, hence clip_on=False.
        ax.text(center, 0.718, label, ha="center", va="top", fontsize=13, clip_on=False)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, fontsize=9)

    for bars, values in [(base_bars, base_vals), (carnice_bars, carnice_vals)]:
        for patch, value in zip(bars, values):
            ax.text(
                patch.get_x() + patch.get_width() / 2,
                value + 0.006,
                pct(value),
                ha="center",
                va="bottom",
                fontsize=13,
                fontweight="bold",
                color="#0f1115",
            )


def _draw_validation_panel(fig, validation_summary: dict) -> None:
    """Draw the right panel: relative loss and perplexity reduction bars."""
    base_loss = validation_summary["base_eval_loss"]
    carnice_loss = validation_summary["adapter_eval_loss"]
    base_ppl = validation_summary["base_eval_perplexity"]
    carnice_ppl = validation_summary["adapter_eval_perplexity"]
    loss_reduction = (base_loss - carnice_loss) / base_loss
    ppl_reduction = (base_ppl - carnice_ppl) / base_ppl

    ax = fig.add_axes([0.735, 0.29, 0.235, 0.56])
    reductions = [loss_reduction, ppl_reduction]
    reduction_x = [0, 1.35]
    bars = ax.bar(reduction_x, reductions, width=0.68, color=["#d8c5ef", "#ffb0a2"], zorder=3)
    add_gradient_bar(ax, bars[0], "#d8c5ef", "#efe7fa")
    add_gradient_bar(ax, bars[1], "#ffaaa0", "#ffd9d0")
    ax.set_xlim(-0.70, 2.05)
    ax.set_ylim(0, 0.38)
    ax.axis("off")

    captions = ["Loss\nreduction", "Perplexity\nreduction"]
    for x_pos, value, caption in zip(reduction_x, reductions, captions):
        ax.text(x_pos, value + 0.017, pct(value), ha="center", va="bottom", fontsize=17, fontweight="bold")
        ax.text(x_pos, -0.017, caption, ha="center", va="top", fontsize=11, clip_on=False)

    fig.text(
        0.845,
        0.145,
        f"Validation loss: {base_loss:.3f} \u2192 {carnice_loss:.3f}",
        ha="center",
        va="center",
        fontsize=10,
        color="#667085",
    )
    fig.text(
        0.845,
        0.092,
        f"Validation perplexity: {base_ppl:.3f} \u2192 {carnice_ppl:.3f}",
        ha="center",
        va="center",
        fontsize=10,
        color="#667085",
    )


def main() -> None:
    """Collect the metrics, write ``metrics.json`` and render the card.

    Creates the ``figures`` and ``data`` directories if needed, writes
    ``data/metrics.json``, saves the card as PNG and SVG, and prints the
    path of the PNG.
    """
    FIGURES.mkdir(parents=True, exist_ok=True)
    DATA.mkdir(parents=True, exist_ok=True)

    metrics = _collect_metrics()
    (DATA / "metrics.json").write_text(json.dumps(metrics, indent=2) + "\n", encoding="utf-8")

    ifeval = metrics["ifeval_limit_20"]
    base_vals = [ifeval["base"][key] for _, key, _ in _IFEVAL_METRICS]
    carnice_vals = [ifeval["carnice_sft"][key] for _, key, _ in _IFEVAL_METRICS]

    # rcParams must be in place before the figure and its artists are created.
    plt.rcParams.update(_RC_PARAMS)
    fig = plt.figure(figsize=(12.93, 6.55), dpi=200)
    _add_card_backgrounds(fig)
    _draw_ifeval_panel(fig, base_vals, carnice_vals)
    _draw_validation_panel(fig, metrics["heldout_training_format_validation"])

    png_path = FIGURES / "qwen36_carnice_sft_benchmark_card.png"
    for path in (png_path, png_path.with_suffix(".svg")):
        fig.savefig(path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    print(png_path)


if __name__ == "__main__":
    main()