# Source: Carnice-V2-27b / benchmarks / scripts / make_benchmark_card.py
# Repository page header (kai-os): "Add files using upload-large-folder tool"
# Commit 31a7782 (verified)
from __future__ import annotations
import json
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import FancyBboxPatch
# Benchmark root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Output locations directly under the root: rendered figures and the metrics JSON.
FIGURES = ROOT.joinpath("figures")
DATA = ROOT.joinpath("data")
def load_json(path: Path) -> dict:
    """Parse the UTF-8 encoded file at ``path`` as JSON and return the result."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
def first_result(root: Path, pattern: str) -> Path:
    """Return the first path under ``root`` that matches ``pattern``, in sorted order.

    Args:
        root: Directory the glob pattern is evaluated relative to.
        pattern: Glob pattern handed to ``Path.glob``.

    Returns:
        The smallest matching path (the element ``sorted(...)[0]`` would give).

    Raises:
        FileNotFoundError: If nothing matches. The message names both the
            pattern and the directory that was searched, so a missing input
            can be located without re-running under a debugger.
    """
    # min() selects the same element as sorting and taking index 0, without
    # materialising the whole list of matches.
    match = min(root.glob(pattern), default=None)
    if match is None:
        raise FileNotFoundError(f"no files matching {pattern!r} under {root}")
    return match
def pct(value: float) -> str:
    """Render a fraction as a percentage with one decimal place, e.g. 0.5 -> "50.0%"."""
    scaled = value * 100
    return "{:.1f}%".format(scaled)
def add_gradient_bar(ax, patch, top_color: str, bottom_color: str) -> None:
    """Paint a vertical colour gradient over a bar and make the bar itself transparent.

    A 256-step image running from ``bottom_color`` (at the bar's lower edge)
    to ``top_color`` (at its upper edge) is drawn with ``ax.imshow`` over the
    rectangle reported by ``patch`` and clipped to it. Bars whose height is
    zero or negative are left untouched.

    Args:
        ax: Object providing ``imshow``; the gradient image is drawn on it.
        patch: Bar object providing ``get_x``/``get_y``/``get_width``/
            ``get_height``/``get_zorder`` and ``set_facecolor``/``set_edgecolor``.
        top_color: ``#rrggbb`` hex colour used at the top of the bar.
        bottom_color: ``#rrggbb`` hex colour used at the bottom of the bar.
    """
    left, base = patch.get_x(), patch.get_y()
    span_x, span_y = patch.get_width(), patch.get_height()
    if span_y <= 0:
        return

    def hex_to_unit_rgb(code: str) -> np.ndarray:
        # Split the six hex digits into three byte-sized channels scaled to [0, 1].
        digits = code.lstrip("#")
        channels = [int(digits[start : start + 2], 16) / 255 for start in range(0, 6, 2)]
        return np.array(channels)

    top_rgb = hex_to_unit_rgb(top_color)
    bottom_rgb = hex_to_unit_rgb(bottom_color)
    # Row 0 is the bottom colour; with origin="lower" it lands on the bar's lower edge.
    ramp = np.linspace(bottom_rgb, top_rgb, 256).reshape(256, 1, 3)

    image_options = {
        "extent": [left, left + span_x, base, base + span_y],
        "origin": "lower",
        "aspect": "auto",
        "clip_path": patch,
        "clip_on": True,
        # Slightly above the bar so the image is drawn over it.
        "zorder": patch.get_zorder() + 0.1,
    }
    ax.imshow(ramp, **image_options)

    # Fully transparent fill and outline: only the gradient image stays visible.
    transparent = (1, 1, 1, 0)
    patch.set_facecolor(transparent)
    patch.set_edgecolor(transparent)
def main() -> None:
    """Collect benchmark results, write ``metrics.json`` and render the benchmark card.

    Reads the IFEval result files, the validation summary and the BFCL score
    files from below ``ROOT / "raw"``, writes the combined values to
    ``DATA / "metrics.json"``, draws a two-panel figure (IFEval accuracies on
    the left, validation loss/perplexity reductions on the right), saves it
    under ``FIGURES`` as PNG and SVG, and prints the PNG path.

    Raises:
        FileNotFoundError: Propagated from ``first_result`` when an IFEval
            result file cannot be found.
    """
    # Make sure both output directories exist before anything is written.
    FIGURES.mkdir(parents=True, exist_ok=True)
    DATA.mkdir(parents=True, exist_ok=True)
    # Input directories holding the raw benchmark outputs.
    remote = ROOT / "raw" / "remote_benchmarks"
    bfcl_scores = ROOT / "raw" / "bfcl_scores"
    # IFEval: take the first matching results file per model and keep only
    # the ["results"]["ifeval"] section.
    base_ifeval_path = first_result(remote, "ifeval_base/*/results_*.json")
    adapter_ifeval_path = first_result(remote, "ifeval_adapter/*/results_*.json")
    base_ifeval = load_json(base_ifeval_path)["results"]["ifeval"]
    adapter_ifeval = load_json(adapter_ifeval_path)["results"]["ifeval"]
    validation_summary = load_json(remote / "qwen36_carnice_benchmark_summary_20260425.json")[
        "training_format_validation"
    ]
    # BFCL base scores: the whole file is parsed as a single JSON document.
    bfcl_base = load_json(
        bfcl_scores / "qwen36-base-local-FC" / "multi_turn" / "BFCL_v4_multi_turn_base_score.json"
    )
    bfcl_adapter_path = (
        bfcl_scores
        / "qwen36-carnice-v1-local-FC"
        / "multi_turn"
        / "BFCL_v4_multi_turn_base_score.json"
    )
    # BFCL adapter scores: only the first non-blank line is parsed as JSON.
    # NOTE(review): the two score files are read differently -- presumably the
    # adapter file holds several JSON lines with the summary first; confirm the
    # base file really is a single JSON document. An adapter file without any
    # non-blank line makes next() raise StopIteration here.
    bfcl_adapter = json.loads(
        next(line for line in bfcl_adapter_path.read_text(encoding="utf-8").splitlines() if line.strip())
    )
    # Everything that is plotted or reported, gathered into one serialisable dict.
    metrics = {
        "run": "qwen36_short_public_ab_20260425_155339",
        "model": {
            "base": "Qwen/Qwen3.6-27B",
            "carnice_sft": "qwen36_carnice_direct_v1b_lora_8192_split_200step",
        },
        "note": "All plotted values are raw measured values from the included benchmark files.",
        "ifeval_limit_20": {
            "base": {
                "prompt_strict": base_ifeval["prompt_level_strict_acc,none"],
                "prompt_loose": base_ifeval["prompt_level_loose_acc,none"],
                "instruction_strict": base_ifeval["inst_level_strict_acc,none"],
                "instruction_loose": base_ifeval["inst_level_loose_acc,none"],
            },
            "carnice_sft": {
                "prompt_strict": adapter_ifeval["prompt_level_strict_acc,none"],
                "prompt_loose": adapter_ifeval["prompt_level_loose_acc,none"],
                "instruction_strict": adapter_ifeval["inst_level_strict_acc,none"],
                "instruction_loose": adapter_ifeval["inst_level_loose_acc,none"],
            },
        },
        "heldout_training_format_validation": validation_summary,
        "bfcl_multi_turn_base_limit_2": {
            # The base entry is stored as loaded; the adapter entry is reduced
            # to the three summary fields below.
            "base": bfcl_base,
            "carnice_sft": {
                "accuracy": bfcl_adapter["accuracy"],
                "correct_count": bfcl_adapter["correct_count"],
                "total_count": bfcl_adapter["total_count"],
            },
        },
        "source_files": {
            "ifeval_base": str(base_ifeval_path.relative_to(ROOT)),
            "ifeval_carnice_sft": str(adapter_ifeval_path.relative_to(ROOT)),
            "bfcl_scores": "raw/bfcl_scores/",
            "validation": "raw/remote_benchmarks/qwen36_carnice_benchmark_summary_20260425.json",
        },
    }
    (DATA / "metrics.json").write_text(json.dumps(metrics, indent=2) + "\n", encoding="utf-8")
    # (display label, key in metrics["ifeval_limit_20"][...]) for each bar group.
    labels = [
        ("Prompt strict", "prompt_strict"),
        ("Prompt loose", "prompt_loose"),
        ("Instruction strict", "instruction_strict"),
        ("Instruction loose", "instruction_loose"),
    ]
    base_vals = [metrics["ifeval_limit_20"]["base"][key] for _, key in labels]
    carnice_vals = [metrics["ifeval_limit_20"]["carnice_sft"][key] for _, key in labels]
    base_loss = validation_summary["base_eval_loss"]
    carnice_loss = validation_summary["adapter_eval_loss"]
    base_ppl = validation_summary["base_eval_perplexity"]
    carnice_ppl = validation_summary["adapter_eval_perplexity"]
    # Relative reductions with respect to the base model's value
    # (a base value of 0 would raise ZeroDivisionError).
    loss_reduction = (base_loss - carnice_loss) / base_loss
    ppl_reduction = (base_ppl - carnice_ppl) / base_ppl
    # Global styling: white backgrounds, dark text, grey axes.
    plt.rcParams.update(
        {
            "font.family": "DejaVu Sans",
            "figure.facecolor": "#ffffff",
            "axes.facecolor": "#ffffff",
            "savefig.facecolor": "#ffffff",
            "text.color": "#0f1115",
            "axes.labelcolor": "#0f1115",
            "xtick.color": "#0f1115",
            "ytick.color": "#4b5563",
            "axes.edgecolor": "#a6a6a6",
        }
    )
    fig = plt.figure(figsize=(12.93, 6.55), dpi=200)
    # Axes rectangles in figure coordinates, passed to fig.add_axes below.
    left = [0.060, 0.21, 0.60, 0.66]
    right = [0.735, 0.29, 0.235, 0.56]
    # Two rounded background cards, placed in figure coordinates with a low
    # zorder (-10).
    for x, y, w, h in [(0.018, 0.04, 0.66, 0.92), (0.705, 0.04, 0.275, 0.92)]:
        fig.add_artist(
            FancyBboxPatch(
                (x, y),
                w,
                h,
                boxstyle="round,pad=0.016,rounding_size=0.025",
                transform=fig.transFigure,
                linewidth=1.0,
                edgecolor="#e4e7ec",
                facecolor="#ffffff",
                zorder=-10,
            )
        )
    # Left panel: one base/Carnice bar pair per IFEval metric.
    ax = fig.add_axes(left)
    # Group centres, spaced 1.55 apart; note that x is rebound here after
    # being used as the loop variable above.
    x = np.arange(len(labels)) * 1.55
    width = 0.46
    base_bars = ax.bar(x - width / 2, base_vals, width=width, color="#d8c6ef", zorder=3)
    carnice_bars = ax.bar(x + width / 2, carnice_vals, width=width, color="#cbd9f7", zorder=3)
    for patch in base_bars:
        add_gradient_bar(ax, patch, "#d8c5ef", "#efe7fa")
    for patch in carnice_bars:
        add_gradient_bar(ax, patch, "#c8d7f6", "#eef4ff")
    ax.set_xlim(x[0] - 0.70, x[-1] + 0.70)
    # NOTE(review): the y-range is fixed to 0.75-1.005, so accuracies below
    # 0.75 would fall outside the visible range.
    ax.set_ylim(0.75, 1.005)
    yticks = np.arange(0.75, 1.01, 0.05)
    ax.set_yticks(yticks)
    ax.set_yticklabels([f"{int(round(v * 100))}%" for v in yticks], fontsize=10)
    ax.grid(axis="y", color="#dfe3ea", linewidth=1.0, linestyle=(0, (2.2, 2.2)), zorder=0)
    ax.spines[["top", "right", "left"]].set_visible(False)
    ax.spines["bottom"].set_color("#9ca3af")
    ax.tick_params(axis="y", length=0, pad=8)
    ax.tick_params(axis="x", length=0, pad=10)
    # One x tick per bar ("Base" / "Carnice SFT"); the group label is drawn
    # separately at y=0.718, below the lower y-limit, hence clip_on=False.
    tick_positions = []
    tick_labels = []
    for i, (label, _) in enumerate(labels):
        tick_positions.extend([x[i] - width / 2, x[i] + width / 2])
        tick_labels.extend(["Base", "Carnice SFT"])
        ax.text(x[i], 0.718, label, ha="center", va="top", fontsize=13, clip_on=False)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, fontsize=9)
    # Percentage label just above the top of every bar.
    for bars, values in [(base_bars, base_vals), (carnice_bars, carnice_vals)]:
        for patch, value in zip(bars, values):
            ax.text(
                patch.get_x() + patch.get_width() / 2,
                value + 0.006,
                pct(value),
                ha="center",
                va="bottom",
                fontsize=13,
                fontweight="bold",
                color="#0f1115",
            )
    # Right panel: relative loss and perplexity reductions as two bars.
    ax2 = fig.add_axes(right)
    reductions = [loss_reduction, ppl_reduction]
    reduction_x = [0, 1.35]
    bars = ax2.bar(reduction_x, reductions, width=0.68, color=["#d8c5ef", "#ffb0a2"], zorder=3)
    add_gradient_bar(ax2, bars[0], "#d8c5ef", "#efe7fa")
    add_gradient_bar(ax2, bars[1], "#ffaaa0", "#ffd9d0")
    ax2.set_xlim(-0.70, 2.05)
    ax2.set_ylim(0, 0.38)
    # The axis decorations are switched off; value and name labels are placed
    # manually below.
    ax2.axis("off")
    for i, (bar, value, label) in enumerate(
        zip(bars, reductions, ["Loss\nreduction", "Perplexity\nreduction"])
    ):
        x_pos = reduction_x[i]
        ax2.text(x_pos, value + 0.017, pct(value), ha="center", va="bottom", fontsize=17, fontweight="bold")
        ax2.text(x_pos, -0.017, label, ha="center", va="top", fontsize=11, clip_on=False)
    # Footnotes with the absolute before -> after values, in figure coordinates.
    fig.text(0.845, 0.145, f"Validation loss: {base_loss:.3f} \u2192 {carnice_loss:.3f}",
             ha="center", va="center", fontsize=10, color="#667085")
    fig.text(0.845, 0.092, f"Validation perplexity: {base_ppl:.3f} \u2192 {carnice_ppl:.3f}",
             ha="center", va="center", fontsize=10, color="#667085")
    # Save the same figure as PNG and SVG; only the PNG path is printed.
    for path in [FIGURES / "qwen36_carnice_sft_benchmark_card.png", FIGURES / "qwen36_carnice_sft_benchmark_card.svg"]:
        fig.savefig(path)
    print(FIGURES / "qwen36_carnice_sft_benchmark_card.png")
# Generate the card only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()