| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
|
|
| import matplotlib.pyplot as plt |
| import numpy as np |
| from matplotlib.patches import FancyBboxPatch |
|
|
|
|
# Base directory for all inputs and outputs: the parent of the directory that
# contains this script (second ancestor of the resolved file path).
ROOT = Path(__file__).resolve().parents[1]
# Output directory for the rendered benchmark card images.
FIGURES = ROOT.joinpath("figures")
# Output directory for the exported metrics.json file.
DATA = ROOT.joinpath("data")
|
|
|
|
def load_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at *path* and return its content."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
def first_result(root: Path, pattern: str) -> Path:
    """Return the first path under *root* that matches the glob *pattern*.

    "First" means the smallest match in ``Path`` ordering, i.e. the same
    element that sorting all matches and taking index 0 would give.

    Args:
        root: Directory the glob is evaluated against.
        pattern: Glob pattern relative to *root*.

    Returns:
        The smallest matching path.

    Raises:
        FileNotFoundError: If nothing matches. The message names both the
            pattern and the directory searched so the failure can be traced
            without re-running the script.
    """
    # min() with a default avoids sorting every match just to take one.
    first = min(root.glob(pattern), default=None)
    if first is None:
        raise FileNotFoundError(f"no files matching {pattern!r} under {root}")
    return first
|
|
|
|
def pct(value: float) -> str:
    """Format a fraction as a percentage string with one decimal place."""
    scaled = value * 100
    return "{:.1f}%".format(scaled)
|
|
|
|
def add_gradient_bar(ax, patch, top_color: str, bottom_color: str) -> None:
    """Paint a vertical colour gradient over a bar and hide the bar's own fill.

    A 256-step ramp running from *bottom_color* (lowest row) to *top_color*
    (highest row) is drawn with ``ax.imshow`` over the rectangle reported by
    *patch*, clipped to the patch and placed just above it in z-order. The
    patch's face and edge are then made fully transparent so only the ramp
    shows.

    Args:
        ax: Object providing ``imshow`` (the axes that owns the bar).
        patch: The bar; must provide the ``get_x``/``get_y``/``get_width``/
            ``get_height``/``get_zorder`` getters and the
            ``set_facecolor``/``set_edgecolor`` setters.
        top_color: Hex colour string (``#rrggbb``) used at the top of the bar.
        bottom_color: Hex colour string (``#rrggbb``) used at the bottom.

    Bars whose height is zero or negative are left untouched.
    """
    left_edge, lower_edge = patch.get_x(), patch.get_y()
    bar_width, bar_height = patch.get_width(), patch.get_height()
    if bar_height <= 0:
        return

    def hex_to_unit_rgb(code: str) -> np.ndarray:
        # Split "rrggbb" into three byte pairs and scale each to the 0..1 range.
        digits = code.lstrip("#")
        channels = [int(digits[start : start + 2], 16) / 255 for start in range(0, 6, 2)]
        return np.array(channels)

    upper_rgb = hex_to_unit_rgb(top_color)
    lower_rgb = hex_to_unit_rgb(bottom_color)

    # Shape (256, 1, 3): one pixel wide, stretched horizontally by `extent`.
    ramp = np.linspace(lower_rgb, upper_rgb, 256)[:, np.newaxis, :]

    image_options = {
        "extent": [left_edge, left_edge + bar_width, lower_edge, lower_edge + bar_height],
        # Row 0 of the ramp (the bottom colour) is drawn at the lower edge.
        "origin": "lower",
        "aspect": "auto",
        "clip_path": patch,
        "clip_on": True,
        "zorder": patch.get_zorder() + 0.1,
    }
    ax.imshow(ramp, **image_options)

    fully_transparent = (1, 1, 1, 0)
    patch.set_facecolor(fully_transparent)
    patch.set_edgecolor(fully_transparent)
|
|
|
|
def _load_score_summary(path: Path) -> dict:
    """Return the leading JSON record of a BFCL score file.

    The file may hold a single JSON document, or several JSON records with one
    record per line; in the second case the first non-blank line is returned.
    Both score files are read through this helper so they are handled alike.

    Raises:
        ValueError: If the file contains no non-blank line
            (``json.JSONDecodeError`` is itself a ``ValueError`` subclass).
    """
    text = path.read_text(encoding="utf-8")
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Not a single document: treat it as line-delimited records.
        for line in text.splitlines():
            if line.strip():
                return json.loads(line)
    raise ValueError(f"no JSON record found in {path}")


def _collect_metrics() -> dict:
    """Read the raw benchmark files under ``ROOT / "raw"`` and build the metrics dict.

    The returned mapping is what ``main`` writes to ``metrics.json``; key order
    matters because it determines the order in the written file.
    """
    remote = ROOT / "raw" / "remote_benchmarks"
    bfcl_scores = ROOT / "raw" / "bfcl_scores"
    score_file = "BFCL_v4_multi_turn_base_score.json"

    base_ifeval_path = first_result(remote, "ifeval_base/*/results_*.json")
    adapter_ifeval_path = first_result(remote, "ifeval_adapter/*/results_*.json")

    base_ifeval = load_json(base_ifeval_path)["results"]["ifeval"]
    adapter_ifeval = load_json(adapter_ifeval_path)["results"]["ifeval"]
    validation_summary = load_json(
        remote / "qwen36_carnice_benchmark_summary_20260425.json"
    )["training_format_validation"]

    bfcl_base = _load_score_summary(
        bfcl_scores / "qwen36-base-local-FC" / "multi_turn" / score_file
    )
    bfcl_adapter = _load_score_summary(
        bfcl_scores / "qwen36-carnice-v1-local-FC" / "multi_turn" / score_file
    )

    # Output key -> key used in the IFEval results file.
    ifeval_fields = {
        "prompt_strict": "prompt_level_strict_acc,none",
        "prompt_loose": "prompt_level_loose_acc,none",
        "instruction_strict": "inst_level_strict_acc,none",
        "instruction_loose": "inst_level_loose_acc,none",
    }

    return {
        "run": "qwen36_short_public_ab_20260425_155339",
        "model": {
            "base": "Qwen/Qwen3.6-27B",
            "carnice_sft": "qwen36_carnice_direct_v1b_lora_8192_split_200step",
        },
        "note": "All plotted values are raw measured values from the included benchmark files.",
        "ifeval_limit_20": {
            "base": {name: base_ifeval[key] for name, key in ifeval_fields.items()},
            "carnice_sft": {name: adapter_ifeval[key] for name, key in ifeval_fields.items()},
        },
        "heldout_training_format_validation": validation_summary,
        "bfcl_multi_turn_base_limit_2": {
            "base": bfcl_base,
            "carnice_sft": {
                "accuracy": bfcl_adapter["accuracy"],
                "correct_count": bfcl_adapter["correct_count"],
                "total_count": bfcl_adapter["total_count"],
            },
        },
        "source_files": {
            "ifeval_base": str(base_ifeval_path.relative_to(ROOT)),
            "ifeval_carnice_sft": str(adapter_ifeval_path.relative_to(ROOT)),
            "bfcl_scores": "raw/bfcl_scores/",
            "validation": "raw/remote_benchmarks/qwen36_carnice_benchmark_summary_20260425.json",
        },
    }


def _draw_card_backgrounds(fig) -> None:
    """Add the two rounded background rectangles (in figure coordinates) behind the panels."""
    for x, y, w, h in [(0.018, 0.04, 0.66, 0.92), (0.705, 0.04, 0.275, 0.92)]:
        fig.add_artist(
            FancyBboxPatch(
                (x, y),
                w,
                h,
                boxstyle="round,pad=0.016,rounding_size=0.025",
                transform=fig.transFigure,
                linewidth=1.0,
                edgecolor="#e4e7ec",
                facecolor="#ffffff",
                zorder=-10,
            )
        )


def _draw_ifeval_panel(fig, labels: list, base_vals: list, carnice_vals: list) -> None:
    """Draw the left panel: grouped Base / Carnice SFT bars, one group per IFEval metric.

    Args:
        fig: Figure to add the axes to.
        labels: ``(display label, metrics key)`` pairs, one per bar group.
        base_vals: Base-model values, in the same order as *labels*.
        carnice_vals: Carnice SFT values, in the same order as *labels*.
    """
    ax = fig.add_axes([0.060, 0.21, 0.60, 0.66])
    x = np.arange(len(labels)) * 1.55
    width = 0.46
    base_bars = ax.bar(x - width / 2, base_vals, width=width, color="#d8c6ef", zorder=3)
    carnice_bars = ax.bar(x + width / 2, carnice_vals, width=width, color="#cbd9f7", zorder=3)
    for patch in base_bars:
        add_gradient_bar(ax, patch, "#d8c5ef", "#efe7fa")
    for patch in carnice_bars:
        add_gradient_bar(ax, patch, "#c8d7f6", "#eef4ff")

    # Limits are set after the gradient images are added so these values win.
    ax.set_xlim(x[0] - 0.70, x[-1] + 0.70)
    # The y-axis starts at 75%, so only the upper part of each bar is visible.
    ax.set_ylim(0.75, 1.005)
    yticks = np.arange(0.75, 1.01, 0.05)
    ax.set_yticks(yticks)
    ax.set_yticklabels([f"{int(round(v * 100))}%" for v in yticks], fontsize=10)
    ax.grid(axis="y", color="#dfe3ea", linewidth=1.0, linestyle=(0, (2.2, 2.2)), zorder=0)
    ax.spines[["top", "right", "left"]].set_visible(False)
    ax.spines["bottom"].set_color("#9ca3af")
    ax.tick_params(axis="y", length=0, pad=8)
    ax.tick_params(axis="x", length=0, pad=10)

    # One tick per bar ("Base" / "Carnice SFT"), plus one group label per metric.
    tick_positions = []
    for center, (label, _) in zip(x, labels):
        tick_positions.extend([center - width / 2, center + width / 2])
        # y=0.718 is below the lower axis limit; clip_on=False lets it render there.
        ax.text(center, 0.718, label, ha="center", va="top", fontsize=13, clip_on=False)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(["Base", "Carnice SFT"] * len(labels), fontsize=9)

    # Percentage value above every bar.
    for bars, values in [(base_bars, base_vals), (carnice_bars, carnice_vals)]:
        for patch, value in zip(bars, values):
            ax.text(
                patch.get_x() + patch.get_width() / 2,
                value + 0.006,
                pct(value),
                ha="center",
                va="bottom",
                fontsize=13,
                fontweight="bold",
                color="#0f1115",
            )


def _draw_validation_panel(fig, validation_summary: dict) -> None:
    """Draw the right panel: relative loss and perplexity reduction, plus raw-value captions.

    Args:
        fig: Figure to add the axes and captions to.
        validation_summary: Mapping with the ``base_eval_loss``,
            ``adapter_eval_loss``, ``base_eval_perplexity`` and
            ``adapter_eval_perplexity`` entries.
    """
    base_loss = validation_summary["base_eval_loss"]
    carnice_loss = validation_summary["adapter_eval_loss"]
    base_ppl = validation_summary["base_eval_perplexity"]
    carnice_ppl = validation_summary["adapter_eval_perplexity"]
    # Reductions are relative to the base model's value.
    reductions = [
        (base_loss - carnice_loss) / base_loss,
        (base_ppl - carnice_ppl) / base_ppl,
    ]

    ax2 = fig.add_axes([0.735, 0.29, 0.235, 0.56])
    reduction_x = [0, 1.35]
    bars = ax2.bar(reduction_x, reductions, width=0.68, color=["#d8c5ef", "#ffb0a2"], zorder=3)
    add_gradient_bar(ax2, bars[0], "#d8c5ef", "#efe7fa")
    add_gradient_bar(ax2, bars[1], "#ffaaa0", "#ffd9d0")
    ax2.set_xlim(-0.70, 2.05)
    ax2.set_ylim(0, 0.38)
    ax2.axis("off")

    captions = ["Loss\nreduction", "Perplexity\nreduction"]
    for x_pos, value, label in zip(reduction_x, reductions, captions):
        ax2.text(x_pos, value + 0.017, pct(value), ha="center", va="bottom", fontsize=17, fontweight="bold")
        ax2.text(x_pos, -0.017, label, ha="center", va="top", fontsize=11, clip_on=False)

    fig.text(0.845, 0.145, f"Validation loss: {base_loss:.3f} \u2192 {carnice_loss:.3f}",
             ha="center", va="center", fontsize=10, color="#667085")
    fig.text(0.845, 0.092, f"Validation perplexity: {base_ppl:.3f} \u2192 {carnice_ppl:.3f}",
             ha="center", va="center", fontsize=10, color="#667085")


def main() -> None:
    """Export ``metrics.json`` and render the benchmark card as PNG and SVG.

    Steps, in order: create the output directories, load the raw benchmark
    files, write ``DATA / "metrics.json"``, apply the plot style to the global
    matplotlib rcParams, draw the two-panel card, save it under ``FIGURES`` in
    both formats, and print the PNG path.

    Raises:
        FileNotFoundError: If an expected IFEval results file is missing.
        KeyError: If a loaded file lacks one of the expected entries.
        ValueError: If a BFCL score file contains no JSON record.
    """
    FIGURES.mkdir(parents=True, exist_ok=True)
    DATA.mkdir(parents=True, exist_ok=True)

    metrics = _collect_metrics()
    (DATA / "metrics.json").write_text(json.dumps(metrics, indent=2) + "\n", encoding="utf-8")

    labels = [
        ("Prompt strict", "prompt_strict"),
        ("Prompt loose", "prompt_loose"),
        ("Instruction strict", "instruction_strict"),
        ("Instruction loose", "instruction_loose"),
    ]
    ifeval = metrics["ifeval_limit_20"]
    base_vals = [ifeval["base"][key] for _, key in labels]
    carnice_vals = [ifeval["carnice_sft"][key] for _, key in labels]

    plt.rcParams.update(
        {
            "font.family": "DejaVu Sans",
            "figure.facecolor": "#ffffff",
            "axes.facecolor": "#ffffff",
            "savefig.facecolor": "#ffffff",
            "text.color": "#0f1115",
            "axes.labelcolor": "#0f1115",
            "xtick.color": "#0f1115",
            "ytick.color": "#4b5563",
            "axes.edgecolor": "#a6a6a6",
        }
    )

    png_path = FIGURES / "qwen36_carnice_sft_benchmark_card.png"
    fig = plt.figure(figsize=(12.93, 6.55), dpi=200)
    try:
        _draw_card_backgrounds(fig)
        _draw_ifeval_panel(fig, labels, base_vals, carnice_vals)
        _draw_validation_panel(fig, metrics["heldout_training_format_validation"])
        for path in (png_path, png_path.with_suffix(".svg")):
            fig.savefig(path)
    finally:
        # Release the figure even if drawing or saving fails, so repeated
        # calls do not accumulate open figures in pyplot's registry.
        plt.close(fig)

    print(png_path)
|
|
|
|
# Generate the metrics file and benchmark card only when this file is run as a
# script; importing it as a module has no such effect.
if __name__ == "__main__":
    main()
|
|