File size: 1,852 Bytes
31a7782
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
{
  "run": "qwen36_short_public_ab_20260425_155339",
  "model": {
    "base": "Qwen/Qwen3.6-27B",
    "carnice_sft": "qwen36_carnice_direct_v1b_lora_8192_split_200step"
  },
  "note": "All plotted values are raw measured values from the included benchmark files.",
  "ifeval_limit_20": {
    "base": {
      "prompt_strict": 0.85,
      "prompt_loose": 0.85,
      "instruction_strict": 0.9,
      "instruction_loose": 0.9
    },
    "carnice_sft": {
      "prompt_strict": 0.9,
      "prompt_loose": 0.9,
      "instruction_strict": 0.9333333333333333,
      "instruction_loose": 0.9333333333333333
    }
  },
  "heldout_training_format_validation": {
    "base_eval_loss": 0.6070852279663086,
    "adapter_eval_loss": 0.4140348732471466,
    "base_eval_perplexity": 1.8350747712770092,
    "adapter_eval_perplexity": 1.5129098860448142,
    "loss_reduction_pct": 31.799547382476533,
    "perplexity_reduction_pct": 17.55595413738939,
    "eval_examples": 110,
    "eval_rows_in": 64,
    "metric_note": "Exact training-format assistant-only validation metric from train_qwen36_carnice_unsloth.py, not external agent benchmark."
  },
  "bfcl_multi_turn_base_limit_2": {
    "base": {
      "accuracy": 1.0,
      "correct_count": 2,
      "total_count": 2
    },
    "carnice_sft": {
      "accuracy": 0.0,
      "correct_count": 0,
      "total_count": 2
    }
  },
  "source_files": {
    "ifeval_base": "raw/remote_benchmarks/ifeval_base/Qwen__Qwen3.6-27B/results_2026-04-25T16-37-26.991493.json",
    "ifeval_carnice_sft": "raw/remote_benchmarks/ifeval_adapter/__home__ubuntu__hermes-glm5-stagea-pilot__outputs__qwen36_carnice_direct_v1b_lora_8192_split_200step__adapter/results_2026-04-25T16-25-02.636087.json",
    "bfcl_scores": "raw/bfcl_scores/",
    "validation": "raw/remote_benchmarks/qwen36_carnice_benchmark_summary_20260425.json"
  }
}