| { |
| "run": "qwen36_short_public_ab_20260425_155339", |
| "model": { |
| "base": "Qwen/Qwen3.6-27B", |
| "carnice_sft": "qwen36_carnice_direct_v1b_lora_8192_split_200step" |
| }, |
| "note": "All plotted values are raw measured values from the included benchmark files.", |
| "ifeval_limit_20": { |
| "base": { |
| "prompt_strict": 0.85, |
| "prompt_loose": 0.85, |
| "instruction_strict": 0.9, |
| "instruction_loose": 0.9 |
| }, |
| "carnice_sft": { |
| "prompt_strict": 0.9, |
| "prompt_loose": 0.9, |
| "instruction_strict": 0.9333333333333333, |
| "instruction_loose": 0.9333333333333333 |
| } |
| }, |
| "heldout_training_format_validation": { |
| "base_eval_loss": 0.6070852279663086, |
| "adapter_eval_loss": 0.4140348732471466, |
| "base_eval_perplexity": 1.8350747712770092, |
| "adapter_eval_perplexity": 1.5129098860448142, |
| "loss_reduction_pct": 31.799547382476533, |
| "perplexity_reduction_pct": 17.55595413738939, |
| "eval_examples": 110, |
| "eval_rows_in": 64, |
| "metric_note": "Exact training-format assistant-only validation metric from train_qwen36_carnice_unsloth.py, not external agent benchmark." |
| }, |
| "bfcl_multi_turn_base_limit_2": { |
| "base": { |
| "accuracy": 1.0, |
| "correct_count": 2, |
| "total_count": 2 |
| }, |
| "carnice_sft": { |
| "accuracy": 0.0, |
| "correct_count": 0, |
| "total_count": 2 |
| } |
| }, |
| "source_files": { |
| "ifeval_base": "raw/remote_benchmarks/ifeval_base/Qwen__Qwen3.6-27B/results_2026-04-25T16-37-26.991493.json", |
| "ifeval_carnice_sft": "raw/remote_benchmarks/ifeval_adapter/__home__ubuntu__hermes-glm5-stagea-pilot__outputs__qwen36_carnice_direct_v1b_lora_8192_split_200step__adapter/results_2026-04-25T16-25-02.636087.json", |
| "bfcl_scores": "raw/bfcl_scores/", |
| "validation": "raw/remote_benchmarks/qwen36_carnice_benchmark_summary_20260425.json" |
| } |
| } |
|
|