Carnice-V2-27b / benchmarks /data /metrics.json
kai-os's picture
Add files using upload-large-folder tool
31a7782 verified
{
"run": "qwen36_short_public_ab_20260425_155339",
"model": {
"base": "Qwen/Qwen3.6-27B",
"carnice_sft": "qwen36_carnice_direct_v1b_lora_8192_split_200step"
},
"note": "All plotted values are raw measured values from the included benchmark files.",
"ifeval_limit_20": {
"base": {
"prompt_strict": 0.85,
"prompt_loose": 0.85,
"instruction_strict": 0.9,
"instruction_loose": 0.9
},
"carnice_sft": {
"prompt_strict": 0.9,
"prompt_loose": 0.9,
"instruction_strict": 0.9333333333333333,
"instruction_loose": 0.9333333333333333
}
},
"heldout_training_format_validation": {
"base_eval_loss": 0.6070852279663086,
"adapter_eval_loss": 0.4140348732471466,
"base_eval_perplexity": 1.8350747712770092,
"adapter_eval_perplexity": 1.5129098860448142,
"loss_reduction_pct": 31.799547382476533,
"perplexity_reduction_pct": 17.55595413738939,
"eval_examples": 110,
"eval_rows_in": 64,
"metric_note": "Exact training-format, assistant-only validation metric from train_qwen36_carnice_unsloth.py; not an external agent benchmark."
},
"bfcl_multi_turn_base_limit_2": {
"base": {
"accuracy": 1.0,
"correct_count": 2,
"total_count": 2
},
"carnice_sft": {
"accuracy": 0.0,
"correct_count": 0,
"total_count": 2
}
},
"source_files": {
"ifeval_base": "raw/remote_benchmarks/ifeval_base/Qwen__Qwen3.6-27B/results_2026-04-25T16-37-26.991493.json",
"ifeval_carnice_sft": "raw/remote_benchmarks/ifeval_adapter/__home__ubuntu__hermes-glm5-stagea-pilot__outputs__qwen36_carnice_direct_v1b_lora_8192_split_200step__adapter/results_2026-04-25T16-25-02.636087.json",
"bfcl_scores": "raw/bfcl_scores/",
"validation": "raw/remote_benchmarks/qwen36_carnice_benchmark_summary_20260425.json"
}
}