{ "run": "qwen36_short_public_ab_20260425_155339", "model": { "base": "Qwen/Qwen3.6-27B", "carnice_sft": "qwen36_carnice_direct_v1b_lora_8192_split_200step" }, "note": "All plotted values are raw measured values from the included benchmark files.", "ifeval_limit_20": { "base": { "prompt_strict": 0.85, "prompt_loose": 0.85, "instruction_strict": 0.9, "instruction_loose": 0.9 }, "carnice_sft": { "prompt_strict": 0.9, "prompt_loose": 0.9, "instruction_strict": 0.9333333333333333, "instruction_loose": 0.9333333333333333 } }, "heldout_training_format_validation": { "base_eval_loss": 0.6070852279663086, "adapter_eval_loss": 0.4140348732471466, "base_eval_perplexity": 1.8350747712770092, "adapter_eval_perplexity": 1.5129098860448142, "loss_reduction_pct": 31.799547382476533, "perplexity_reduction_pct": 17.55595413738939, "eval_examples": 110, "eval_rows_in": 64, "metric_note": "Exact training-format assistant-only validation metric from train_qwen36_carnice_unsloth.py, not external agent benchmark." }, "bfcl_multi_turn_base_limit_2": { "base": { "accuracy": 1.0, "correct_count": 2, "total_count": 2 }, "carnice_sft": { "accuracy": 0.0, "correct_count": 0, "total_count": 2 } }, "source_files": { "ifeval_base": "raw/remote_benchmarks/ifeval_base/Qwen__Qwen3.6-27B/results_2026-04-25T16-37-26.991493.json", "ifeval_carnice_sft": "raw/remote_benchmarks/ifeval_adapter/__home__ubuntu__hermes-glm5-stagea-pilot__outputs__qwen36_carnice_direct_v1b_lora_8192_split_200step__adapter/results_2026-04-25T16-25-02.636087.json", "bfcl_scores": "raw/bfcl_scores/", "validation": "raw/remote_benchmarks/qwen36_carnice_benchmark_summary_20260425.json" } }