HaiwenXia commited on
Commit
87136b4
·
verified ·
1 Parent(s): 2a15825

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 20260124_2143/config.yaml +142 -0
  2. 20260124_2143/reward_model/1769262210.5061178/events.out.tfevents.1769262210.MACLAB-S004.2626926.1 +3 -0
  3. 20260124_2143/reward_model/1769262210.5078583/hparams.yml +4 -0
  4. 20260124_2143/reward_model/events.out.tfevents.1769262210.MACLAB-S004.2626926.0 +3 -0
  5. 20260124_2143/train.20260124_2143.log +803 -0
  6. 20260124_2354/config.yaml +142 -0
  7. 20260124_2354/reward_model/1769270104.0081618/events.out.tfevents.1769270104.MACLAB-S004.3211506.1 +3 -0
  8. 20260124_2354/reward_model/1769270104.0091846/hparams.yml +4 -0
  9. 20260124_2354/reward_model/events.out.tfevents.1769270104.MACLAB-S004.3211506.0 +3 -0
  10. 20260124_2354/train.20260124_2354.log +306 -0
  11. 20260125_0035/config.yaml +142 -0
  12. 20260125_0035/reward_model/1769272544.7198617/events.out.tfevents.1769272544.MACLAB-S004.3403711.1 +3 -0
  13. 20260125_0035/reward_model/1769272544.7213397/hparams.yml +4 -0
  14. 20260125_0035/reward_model/events.out.tfevents.1769272544.MACLAB-S004.3403711.0 +3 -0
  15. 20260125_0035/train.20260125_0035.log +421 -0
  16. 20260125_0037/config.yaml +142 -0
  17. 20260125_0037/eval_results_0125_1713.jsonl +0 -0
  18. 20260125_0037/reward_model/1769272678.832529/events.out.tfevents.1769272678.MACLAB-S004.3414271.1 +3 -0
  19. 20260125_0037/reward_model/1769272678.8337765/hparams.yml +4 -0
  20. 20260125_0037/reward_model/events.out.tfevents.1769272678.MACLAB-S004.3414271.0 +3 -0
  21. 20260125_0037/train.20260125_0037.log +421 -0
  22. 20260125_0038/config.yaml +142 -0
  23. 20260125_0038/reward_model/1769272741.4481056/events.out.tfevents.1769272741.MACLAB-S004.3419169.1 +3 -0
  24. 20260125_0038/reward_model/1769272741.4495451/hparams.yml +4 -0
  25. 20260125_0038/reward_model/events.out.tfevents.1769272741.MACLAB-S004.3419169.0 +3 -0
  26. 20260125_0038/train.20260125_0038.log +211 -0
  27. 20260125_0933/config.yaml +142 -0
  28. 20260125_0933/reward_model/1769304848.6545663/events.out.tfevents.1769304848.MACLAB-S004.1519845.1 +3 -0
  29. 20260125_0933/reward_model/1769304848.6563416/hparams.yml +4 -0
  30. 20260125_0933/reward_model/events.out.tfevents.1769304848.MACLAB-S004.1519845.0 +3 -0
  31. 20260125_0933/train.20260125_0933.log +564 -0
  32. 20260125_0947_CA/config.yaml +142 -0
  33. 20260125_0947_CA/eval_results_0125_1703.jsonl +0 -0
  34. 20260125_0947_CA/reward_model/1769305674.1033533/events.out.tfevents.1769305674.MACLAB-S004.1592070.1 +3 -0
  35. 20260125_0947_CA/reward_model/1769305674.1053352/hparams.yml +4 -0
  36. 20260125_0947_CA/reward_model/events.out.tfevents.1769305674.MACLAB-S004.1592070.0 +3 -0
  37. 20260125_0947_CA/train.20260125_0947_CA.log +438 -0
  38. 20260125_1117/config.yaml +142 -0
  39. 20260125_1117/reward_model/1769311084.1305242/events.out.tfevents.1769311084.MACLAB-S004.2009526.1 +3 -0
  40. 20260125_1117/reward_model/1769311084.1322424/hparams.yml +4 -0
  41. 20260125_1117/reward_model/events.out.tfevents.1769311084.MACLAB-S004.2009526.0 +3 -0
  42. 20260125_1117/train.20260125_1117.log +441 -0
  43. 20260125_1231/config.yaml +142 -0
  44. 20260125_1231/eval_results_0125_1707.jsonl +0 -0
  45. 20260125_1231/reward_model/1769315504.5030606/events.out.tfevents.1769315504.MACLAB-S004.2360364.1 +3 -0
  46. 20260125_1231/reward_model/1769315504.5045948/hparams.yml +4 -0
  47. 20260125_1231/reward_model/events.out.tfevents.1769315504.MACLAB-S004.2360364.0 +3 -0
  48. 20260125_1231/test_20260125_191012_reward_model.best_4499/test_results.json +51 -0
  49. 20260125_1231/test_20260125_194533_reward_model.best_4499/test_config.yaml +142 -0
  50. 20260125_1231/test_20260125_194533_reward_model.best_4499/test_results.json +239 -0
20260124_2143/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '3'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: false
10
+ apply_to_ref: true
11
+ enabled: false
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 1500
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ max_samples: null
21
+ max_val_samples: null
22
+ metadata_jsonl: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/all_comparisons.jsonl
23
+ mode: raw_text_frozen_audio
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: null
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ no_condition: false
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_audio: true
107
+ use_layer_idx: -1
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: null
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 4000
125
+ warmup_steps: 300
126
+ max_grad_norm: 1
127
+ mlp_lr: 0.0001
128
+ num_train_steps: 4000
129
+ num_valid_batches: null
130
+ num_workers: 8
131
+ other_lr: 1.0e-05
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: true
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 100
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: false
20260124_2143/reward_model/1769262210.5061178/events.out.tfevents.1769262210.MACLAB-S004.2626926.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82ee766b07252644d7045f50ffd3d29ed1cbc0b26a834bdb1d855c526f959108
3
+ size 503
20260124_2143/reward_model/1769262210.5078583/hparams.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ batch_size: 48
2
+ grad_accum_every: 1
3
+ learning_rate: 0.0001
4
+ num_train_steps: 4000
20260124_2143/reward_model/events.out.tfevents.1769262210.MACLAB-S004.2626926.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:361130a96e5393eb1f50a4f818c47547a16295e3f01976ce0e9113e0a561cf68
3
+ size 2219689
20260124_2143/train.20260124_2143.log ADDED
@@ -0,0 +1,803 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-24 21:43:19 | INFO | Log file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/train.20260124_2143.log
2
+ 2026-01-24 21:43:19 | INFO | Random seed set to 42
3
+ 2026-01-24 21:43:21 | INFO | Created RawTextFrozenAudioDataset with 3463 samples
4
+ 2026-01-24 21:43:21 | INFO | Split dataset into train (3117) and validation (346) sets (ratio: 10.00%)
5
+ 2026-01-24 21:43:21 | INFO | Will resume from checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
6
+ 2026-01-24 21:43:21 | INFO | Using checkpoint config for model initialization (continue training mode)
7
+ 2026-01-24 21:43:29 | INFO | Created RewardAttentionModel with attention_mode=SA
8
+ 2026-01-24 21:43:29 | INFO | Created PreferenceLoss with filter_ties=True
9
+ 2026-01-24 21:43:29 | INFO | ✓ Gradient checkpointing enabled
10
+ 2026-01-24 21:43:29 | INFO | ✓ EMA enabled with decay=0.9999, update_every=1 (CPU offload)
11
+ 2026-01-24 21:43:29 | INFO | MLP head parameters: 1,186,563 params, lr=0.0001
12
+ 2026-01-24 21:43:29 | INFO | Other parameters: 37,397,634 params, lr=1e-05
13
+ 2026-01-24 21:43:29 | INFO | Using lr_schedule=linear_cosine warmup_steps=300 total_steps=4000
14
+ 2026-01-24 21:43:29 | INFO | Training with fixed validation set
15
+ 2026-01-24 21:43:29 | INFO | Train batch_size: 48, Valid batch_size: 20
16
+ 2026-01-24 21:43:29 | INFO | Missing keys (782): ['text_module.model.embeddings.word_embeddings.weight', 'text_module.model.embeddings.position_embeddings.weight', 'text_module.model.embeddings.token_type_embeddings.weight', 'text_module.model.embeddings.LayerNorm.weight', 'text_module.model.embeddings.LayerNorm.bias']...
17
+ 2026-01-24 21:43:29 | INFO | ✓ EMA state loaded
18
+ 2026-01-24 21:43:29 | INFO | ✓ Starting from step 0 (transfer learning mode, ignoring checkpoint steps=29999)
19
+ 2026-01-24 21:43:29 | INFO | Resumed from /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
20
+ 2026-01-24 21:43:29 | INFO | Parameters: 701.162M total, 38.584M trainable
21
+ 2026-01-24 21:43:29 | INFO | Text encoder (frozen): 328.389M
22
+ 2026-01-24 21:43:29 | INFO | Audio encoder (frozen): 334.189M
23
+ 2026-01-24 21:43:29 | INFO | Other trainable: 38.584M
24
+ 2026-01-24 21:43:29 | INFO | ℹ No LoRA configuration detected
25
+ 2026-01-24 21:43:30 | INFO | ============================================================
26
+ 2026-01-24 21:43:30 | INFO | Ready to start training
27
+ 2026-01-24 21:43:30 | INFO | ============================================================
28
+ 2026-01-24 21:43:30 | INFO | Starting training from step 0
29
+ 2026-01-24 21:43:30 | INFO | ===== Accelerator / CUDA Debug Info =====
30
+ 2026-01-24 21:43:30 | INFO | accelerator.device = cuda
31
+ 2026-01-24 21:43:30 | INFO | mixed_precision = bf16
32
+ 2026-01-24 21:43:30 | INFO | distributed_type = NO
33
+ 2026-01-24 21:43:30 | INFO | num_processes = 1
34
+ 2026-01-24 21:43:30 | INFO | process_index = 0
35
+ 2026-01-24 21:43:30 | INFO | is_main_process = True
36
+ 2026-01-24 21:43:30 | INFO | torch.cuda.is_available() = True
37
+ 2026-01-24 21:43:30 | INFO | torch.cuda.device_count() = 1
38
+ 2026-01-24 21:43:30 | INFO | current_device = 0
39
+ 2026-01-24 21:43:30 | INFO | device_name = NVIDIA GeForce RTX 4090
40
+ 2026-01-24 21:43:30 | INFO | model parameter device = cuda:0
41
+ 2026-01-24 21:43:30 | INFO | Training for 4000.0 steps (~63 epochs, 64 steps/epoch)
42
+ 2026-01-24 21:43:38 | INFO | Step 0: loss=1.6133 | IF_loss=2.2461, MQ_loss=0.9805 | acc=0.740 (IF=0.708, MQ=0.771) | lr=0.000001
43
+ 2026-01-24 21:43:38 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.0.pt (filtered to 38.584M trainable parameters)
44
+ 2026-01-24 21:43:39 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.0.pt (575.2MB)
45
+ 2026-01-24 21:43:39 | INFO | Step 0: Saved to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.0.pt
46
+ 2026-01-24 21:45:32 | INFO |
47
+ ============================================================
48
+ Validation Results (took 9.56s):
49
+ Samples: 346 instruction, 346 quality
50
+ Instruction Acc: 0.6821
51
+ Quality Acc: 0.6387
52
+ Average Acc: 0.6604
53
+ Total Loss: 1.8726
54
+ Instruction Loss: 1.6586
55
+ Quality Loss: 2.0866
56
+ ============================================================
57
+ 2026-01-24 21:45:32 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_99.pt (filtered to 38.584M trainable parameters)
58
+ 2026-01-24 21:45:33 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_99.pt (575.2MB)
59
+ 2026-01-24 21:45:33 | INFO | Best 1 checkpoints:
60
+ 2026-01-24 21:45:33 | INFO | 1. Step 99: acc=0.6604 (reward_model.best_99.pt)
61
+ 2026-01-24 21:45:34 | INFO | Step 100: loss=1.5309 | IF_loss=1.2373, MQ_loss=1.8246 | acc=0.646 (IF=0.688, MQ=0.604) | lr=0.000034
62
+ 2026-01-24 21:47:29 | INFO |
63
+ ============================================================
64
+ Validation Results (took 8.11s):
65
+ Samples: 346 instruction, 346 quality
66
+ Instruction Acc: 0.6850
67
+ Quality Acc: 0.6387
68
+ Average Acc: 0.6618
69
+ Total Loss: 1.8631
70
+ Instruction Loss: 1.6525
71
+ Quality Loss: 2.0736
72
+ ============================================================
73
+ 2026-01-24 21:47:29 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_199.pt (filtered to 38.584M trainable parameters)
74
+ 2026-01-24 21:47:30 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_199.pt (575.2MB)
75
+ 2026-01-24 21:47:30 | INFO | Best 2 checkpoints:
76
+ 2026-01-24 21:47:30 | INFO | 1. Step 199: acc=0.6618 (reward_model.best_199.pt)
77
+ 2026-01-24 21:47:30 | INFO | 2. Step 99: acc=0.6604 (reward_model.best_99.pt)
78
+ 2026-01-24 21:47:31 | INFO | Step 200: loss=0.4360 | IF_loss=0.4299, MQ_loss=0.4421 | acc=0.833 (IF=0.812, MQ=0.854) | lr=0.000067
79
+ 2026-01-24 21:49:25 | INFO |
80
+ ============================================================
81
+ Validation Results (took 9.42s):
82
+ Samples: 346 instruction, 346 quality
83
+ Instruction Acc: 0.6850
84
+ Quality Acc: 0.6387
85
+ Average Acc: 0.6618
86
+ Total Loss: 1.8438
87
+ Instruction Loss: 1.6364
88
+ Quality Loss: 2.0512
89
+ ============================================================
90
+ 2026-01-24 21:49:25 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_299.pt (filtered to 38.584M trainable parameters)
91
+ 2026-01-24 21:49:25 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_299.pt (575.2MB)
92
+ 2026-01-24 21:49:25 | INFO | Best 3 checkpoints:
93
+ 2026-01-24 21:49:25 | INFO | 1. Step 199: acc=0.6618 (reward_model.best_199.pt)
94
+ 2026-01-24 21:49:25 | INFO | 2. Step 299: acc=0.6618 (reward_model.best_299.pt)
95
+ 2026-01-24 21:49:25 | INFO | 3. Step 99: acc=0.6604 (reward_model.best_99.pt)
96
+ 2026-01-24 21:49:26 | INFO | Step 300: loss=0.4121 | IF_loss=0.5007, MQ_loss=0.3235 | acc=0.844 (IF=0.792, MQ=0.896) | lr=0.000100
97
+ 2026-01-24 21:51:23 | INFO |
98
+ ============================================================
99
+ Validation Results (took 7.32s):
100
+ Samples: 346 instruction, 346 quality
101
+ Instruction Acc: 0.6850
102
+ Quality Acc: 0.6387
103
+ Average Acc: 0.6618
104
+ Total Loss: 1.8266
105
+ Instruction Loss: 1.6230
106
+ Quality Loss: 2.0303
107
+ ============================================================
108
+ 2026-01-24 21:51:23 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_399.pt (filtered to 38.584M trainable parameters)
109
+ 2026-01-24 21:51:24 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_399.pt (575.2MB)
110
+ 2026-01-24 21:51:24 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_99.pt
111
+ 2026-01-24 21:51:24 | INFO | Best 3 checkpoints:
112
+ 2026-01-24 21:51:24 | INFO | 1. Step 199: acc=0.6618 (reward_model.best_199.pt)
113
+ 2026-01-24 21:51:24 | INFO | 2. Step 299: acc=0.6618 (reward_model.best_299.pt)
114
+ 2026-01-24 21:51:24 | INFO | 3. Step 399: acc=0.6618 (reward_model.best_399.pt)
115
+ 2026-01-24 21:51:25 | INFO | Step 400: loss=0.4819 | IF_loss=0.4988, MQ_loss=0.4650 | acc=0.760 (IF=0.708, MQ=0.812) | lr=0.000100
116
+ 2026-01-24 21:53:18 | INFO |
117
+ ============================================================
118
+ Validation Results (took 8.30s):
119
+ Samples: 346 instruction, 346 quality
120
+ Instruction Acc: 0.6821
121
+ Quality Acc: 0.6416
122
+ Average Acc: 0.6618
123
+ Total Loss: 1.8103
124
+ Instruction Loss: 1.6100
125
+ Quality Loss: 2.0107
126
+ ============================================================
127
+ 2026-01-24 21:53:18 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_499.pt (filtered to 38.584M trainable parameters)
128
+ 2026-01-24 21:53:19 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_499.pt (575.2MB)
129
+ 2026-01-24 21:53:19 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_499.pt
130
+ 2026-01-24 21:53:19 | INFO | Best 3 checkpoints:
131
+ 2026-01-24 21:53:19 | INFO | 1. Step 199: acc=0.6618 (reward_model.best_199.pt)
132
+ 2026-01-24 21:53:19 | INFO | 2. Step 299: acc=0.6618 (reward_model.best_299.pt)
133
+ 2026-01-24 21:53:19 | INFO | 3. Step 399: acc=0.6618 (reward_model.best_399.pt)
134
+ 2026-01-24 21:53:20 | INFO | Step 500: loss=0.4074 | IF_loss=0.4939, MQ_loss=0.3209 | acc=0.854 (IF=0.792, MQ=0.917) | lr=0.000099
135
+ 2026-01-24 21:55:17 | INFO |
136
+ ============================================================
137
+ Validation Results (took 7.55s):
138
+ Samples: 346 instruction, 346 quality
139
+ Instruction Acc: 0.6821
140
+ Quality Acc: 0.6416
141
+ Average Acc: 0.6618
142
+ Total Loss: 1.7951
143
+ Instruction Loss: 1.5986
144
+ Quality Loss: 1.9916
145
+ ============================================================
146
+ 2026-01-24 21:55:17 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_599.pt (filtered to 38.584M trainable parameters)
147
+ 2026-01-24 21:55:17 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_599.pt (575.2MB)
148
+ 2026-01-24 21:55:17 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_599.pt
149
+ 2026-01-24 21:55:17 | INFO | Best 3 checkpoints:
150
+ 2026-01-24 21:55:17 | INFO | 1. Step 199: acc=0.6618 (reward_model.best_199.pt)
151
+ 2026-01-24 21:55:17 | INFO | 2. Step 299: acc=0.6618 (reward_model.best_299.pt)
152
+ 2026-01-24 21:55:17 | INFO | 3. Step 399: acc=0.6618 (reward_model.best_399.pt)
153
+ 2026-01-24 21:55:18 | INFO | Step 600: loss=0.3505 | IF_loss=0.3784, MQ_loss=0.3226 | acc=0.844 (IF=0.812, MQ=0.875) | lr=0.000098
154
+ 2026-01-24 21:57:14 | INFO |
155
+ ============================================================
156
+ Validation Results (took 7.89s):
157
+ Samples: 346 instruction, 346 quality
158
+ Instruction Acc: 0.6821
159
+ Quality Acc: 0.6445
160
+ Average Acc: 0.6633
161
+ Total Loss: 1.7807
162
+ Instruction Loss: 1.5876
163
+ Quality Loss: 1.9739
164
+ ============================================================
165
+ 2026-01-24 21:57:14 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_699.pt (filtered to 38.584M trainable parameters)
166
+ 2026-01-24 21:57:14 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_699.pt (575.2MB)
167
+ 2026-01-24 21:57:14 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_399.pt
168
+ 2026-01-24 21:57:14 | INFO | Best 3 checkpoints:
169
+ 2026-01-24 21:57:14 | INFO | 1. Step 699: acc=0.6633 (reward_model.best_699.pt)
170
+ 2026-01-24 21:57:14 | INFO | 2. Step 199: acc=0.6618 (reward_model.best_199.pt)
171
+ 2026-01-24 21:57:14 | INFO | 3. Step 299: acc=0.6618 (reward_model.best_299.pt)
172
+ 2026-01-24 21:57:15 | INFO | Step 700: loss=0.2439 | IF_loss=0.3054, MQ_loss=0.1823 | acc=0.875 (IF=0.854, MQ=0.896) | lr=0.000097
173
+ 2026-01-24 21:59:13 | INFO |
174
+ ============================================================
175
+ Validation Results (took 7.71s):
176
+ Samples: 346 instruction, 346 quality
177
+ Instruction Acc: 0.6821
178
+ Quality Acc: 0.6474
179
+ Average Acc: 0.6647
180
+ Total Loss: 1.7686
181
+ Instruction Loss: 1.5780
182
+ Quality Loss: 1.9591
183
+ ============================================================
184
+ 2026-01-24 21:59:13 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_799.pt (filtered to 38.584M trainable parameters)
185
+ 2026-01-24 21:59:13 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_799.pt (575.2MB)
186
+ 2026-01-24 21:59:13 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_299.pt
187
+ 2026-01-24 21:59:13 | INFO | Best 3 checkpoints:
188
+ 2026-01-24 21:59:13 | INFO | 1. Step 799: acc=0.6647 (reward_model.best_799.pt)
189
+ 2026-01-24 21:59:13 | INFO | 2. Step 699: acc=0.6633 (reward_model.best_699.pt)
190
+ 2026-01-24 21:59:13 | INFO | 3. Step 199: acc=0.6618 (reward_model.best_199.pt)
191
+ 2026-01-24 21:59:14 | INFO | Step 800: loss=0.2827 | IF_loss=0.3525, MQ_loss=0.2128 | acc=0.885 (IF=0.875, MQ=0.896) | lr=0.000096
192
+ 2026-01-24 22:01:11 | INFO |
193
+ ============================================================
194
+ Validation Results (took 7.05s):
195
+ Samples: 346 instruction, 346 quality
196
+ Instruction Acc: 0.6850
197
+ Quality Acc: 0.6474
198
+ Average Acc: 0.6662
199
+ Total Loss: 1.7570
200
+ Instruction Loss: 1.5693
201
+ Quality Loss: 1.9446
202
+ ============================================================
203
+ 2026-01-24 22:01:11 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_899.pt (filtered to 38.584M trainable parameters)
204
+ 2026-01-24 22:01:12 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_899.pt (575.2MB)
205
+ 2026-01-24 22:01:12 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_199.pt
206
+ 2026-01-24 22:01:12 | INFO | Best 3 checkpoints:
207
+ 2026-01-24 22:01:12 | INFO | 1. Step 899: acc=0.6662 (reward_model.best_899.pt)
208
+ 2026-01-24 22:01:12 | INFO | 2. Step 799: acc=0.6647 (reward_model.best_799.pt)
209
+ 2026-01-24 22:01:12 | INFO | 3. Step 699: acc=0.6633 (reward_model.best_699.pt)
210
+ 2026-01-24 22:01:13 | INFO | Step 900: loss=0.1525 | IF_loss=0.1838, MQ_loss=0.1212 | acc=0.958 (IF=0.958, MQ=0.958) | lr=0.000094
211
+ 2026-01-24 22:03:07 | INFO |
212
+ ============================================================
213
+ Validation Results (took 7.74s):
214
+ Samples: 346 instruction, 346 quality
215
+ Instruction Acc: 0.6821
216
+ Quality Acc: 0.6474
217
+ Average Acc: 0.6647
218
+ Total Loss: 1.7472
219
+ Instruction Loss: 1.5625
220
+ Quality Loss: 1.9319
221
+ ============================================================
222
+ 2026-01-24 22:03:07 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_999.pt (filtered to 38.584M trainable parameters)
223
+ 2026-01-24 22:03:08 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_999.pt (575.2MB)
224
+ 2026-01-24 22:03:08 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_699.pt
225
+ 2026-01-24 22:03:08 | INFO | Best 3 checkpoints:
226
+ 2026-01-24 22:03:08 | INFO | 1. Step 899: acc=0.6662 (reward_model.best_899.pt)
227
+ 2026-01-24 22:03:08 | INFO | 2. Step 799: acc=0.6647 (reward_model.best_799.pt)
228
+ 2026-01-24 22:03:08 | INFO | 3. Step 999: acc=0.6647 (reward_model.best_999.pt)
229
+ 2026-01-24 22:03:09 | INFO | Step 1000: loss=0.1671 | IF_loss=0.1673, MQ_loss=0.1668 | acc=0.969 (IF=0.979, MQ=0.958) | lr=0.000091
230
+ 2026-01-24 22:05:04 | INFO |
231
+ ============================================================
232
+ Validation Results (took 6.94s):
233
+ Samples: 346 instruction, 346 quality
234
+ Instruction Acc: 0.6850
235
+ Quality Acc: 0.6474
236
+ Average Acc: 0.6662
237
+ Total Loss: 1.7380
238
+ Instruction Loss: 1.5555
239
+ Quality Loss: 1.9205
240
+ ============================================================
241
+ 2026-01-24 22:05:04 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1099.pt (filtered to 38.584M trainable parameters)
242
+ 2026-01-24 22:05:04 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1099.pt (575.2MB)
243
+ 2026-01-24 22:05:04 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_999.pt
244
+ 2026-01-24 22:05:04 | INFO | Best 3 checkpoints:
245
+ 2026-01-24 22:05:04 | INFO | 1. Step 899: acc=0.6662 (reward_model.best_899.pt)
246
+ 2026-01-24 22:05:04 | INFO | 2. Step 1099: acc=0.6662 (reward_model.best_1099.pt)
247
+ 2026-01-24 22:05:04 | INFO | 3. Step 799: acc=0.6647 (reward_model.best_799.pt)
248
+ 2026-01-24 22:05:05 | INFO | Step 1100: loss=0.1267 | IF_loss=0.1381, MQ_loss=0.1154 | acc=0.948 (IF=0.958, MQ=0.938) | lr=0.000089
249
+ 2026-01-24 22:07:02 | INFO |
250
+ ============================================================
251
+ Validation Results (took 7.34s):
252
+ Samples: 346 instruction, 346 quality
253
+ Instruction Acc: 0.6850
254
+ Quality Acc: 0.6416
255
+ Average Acc: 0.6633
256
+ Total Loss: 1.7320
257
+ Instruction Loss: 1.5520
258
+ Quality Loss: 1.9119
259
+ ============================================================
260
+ 2026-01-24 22:07:02 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1199.pt (filtered to 38.584M trainable parameters)
261
+ 2026-01-24 22:07:03 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1199.pt (575.2MB)
262
+ 2026-01-24 22:07:03 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1199.pt
263
+ 2026-01-24 22:07:03 | INFO | Best 3 checkpoints:
264
+ 2026-01-24 22:07:03 | INFO | 1. Step 899: acc=0.6662 (reward_model.best_899.pt)
265
+ 2026-01-24 22:07:03 | INFO | 2. Step 1099: acc=0.6662 (reward_model.best_1099.pt)
266
+ 2026-01-24 22:07:03 | INFO | 3. Step 799: acc=0.6647 (reward_model.best_799.pt)
267
+ 2026-01-24 22:07:04 | INFO | Step 1200: loss=0.1201 | IF_loss=0.1744, MQ_loss=0.0657 | acc=0.948 (IF=0.917, MQ=0.979) | lr=0.000086
268
+ 2026-01-24 22:08:59 | INFO |
269
+ ============================================================
270
+ Validation Results (took 7.61s):
271
+ Samples: 346 instruction, 346 quality
272
+ Instruction Acc: 0.6850
273
+ Quality Acc: 0.6416
274
+ Average Acc: 0.6633
275
+ Total Loss: 1.7259
276
+ Instruction Loss: 1.5481
277
+ Quality Loss: 1.9036
278
+ ============================================================
279
+ 2026-01-24 22:08:59 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1299.pt (filtered to 38.584M trainable parameters)
280
+ 2026-01-24 22:09:00 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1299.pt (575.2MB)
281
+ 2026-01-24 22:09:00 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1299.pt
282
+ 2026-01-24 22:09:00 | INFO | Best 3 checkpoints:
283
+ 2026-01-24 22:09:00 | INFO | 1. Step 899: acc=0.6662 (reward_model.best_899.pt)
284
+ 2026-01-24 22:09:00 | INFO | 2. Step 1099: acc=0.6662 (reward_model.best_1099.pt)
285
+ 2026-01-24 22:09:00 | INFO | 3. Step 799: acc=0.6647 (reward_model.best_799.pt)
286
+ 2026-01-24 22:09:01 | INFO | Step 1300: loss=0.0937 | IF_loss=0.1357, MQ_loss=0.0516 | acc=0.958 (IF=0.938, MQ=0.979) | lr=0.000083
287
+ 2026-01-24 22:10:53 | INFO |
288
+ ============================================================
289
+ Validation Results (took 7.24s):
290
+ Samples: 346 instruction, 346 quality
291
+ Instruction Acc: 0.6850
292
+ Quality Acc: 0.6416
293
+ Average Acc: 0.6633
294
+ Total Loss: 1.7217
295
+ Instruction Loss: 1.5459
296
+ Quality Loss: 1.8975
297
+ ============================================================
298
+ 2026-01-24 22:10:53 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1399.pt (filtered to 38.584M trainable parameters)
299
+ 2026-01-24 22:10:54 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1399.pt (575.2MB)
300
+ 2026-01-24 22:10:54 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1399.pt
301
+ 2026-01-24 22:10:54 | INFO | Best 3 checkpoints:
302
+ 2026-01-24 22:10:54 | INFO | 1. Step 899: acc=0.6662 (reward_model.best_899.pt)
303
+ 2026-01-24 22:10:54 | INFO | 2. Step 1099: acc=0.6662 (reward_model.best_1099.pt)
304
+ 2026-01-24 22:10:54 | INFO | 3. Step 799: acc=0.6647 (reward_model.best_799.pt)
305
+ 2026-01-24 22:10:55 | INFO | Step 1400: loss=0.0782 | IF_loss=0.1080, MQ_loss=0.0484 | acc=0.990 (IF=0.979, MQ=1.000) | lr=0.000080
306
+ 2026-01-24 22:12:49 | INFO |
307
+ ============================================================
308
+ Validation Results (took 7.28s):
309
+ Samples: 346 instruction, 346 quality
310
+ Instruction Acc: 0.6908
311
+ Quality Acc: 0.6416
312
+ Average Acc: 0.6662
313
+ Total Loss: 1.7182
314
+ Instruction Loss: 1.5441
315
+ Quality Loss: 1.8922
316
+ ============================================================
317
+ 2026-01-24 22:12:49 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1499.pt (filtered to 38.584M trainable parameters)
318
+ 2026-01-24 22:12:49 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1499.pt (575.2MB)
319
+ 2026-01-24 22:12:49 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_799.pt
320
+ 2026-01-24 22:12:49 | INFO | Best 3 checkpoints:
321
+ 2026-01-24 22:12:49 | INFO | 1. Step 899: acc=0.6662 (reward_model.best_899.pt)
322
+ 2026-01-24 22:12:49 | INFO | 2. Step 1099: acc=0.6662 (reward_model.best_1099.pt)
323
+ 2026-01-24 22:12:49 | INFO | 3. Step 1499: acc=0.6662 (reward_model.best_1499.pt)
324
+ 2026-01-24 22:12:50 | INFO | Step 1500: loss=0.0699 | IF_loss=0.0703, MQ_loss=0.0695 | acc=0.979 (IF=0.979, MQ=0.979) | lr=0.000076
325
+ 2026-01-24 22:14:41 | INFO |
326
+ ============================================================
327
+ Validation Results (took 7.76s):
328
+ Samples: 346 instruction, 346 quality
329
+ Instruction Acc: 0.6908
330
+ Quality Acc: 0.6416
331
+ Average Acc: 0.6662
332
+ Total Loss: 1.7151
333
+ Instruction Loss: 1.5435
334
+ Quality Loss: 1.8867
335
+ ============================================================
336
+ 2026-01-24 22:14:41 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1599.pt (filtered to 38.584M trainable parameters)
337
+ 2026-01-24 22:14:42 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1599.pt (575.2MB)
338
+ 2026-01-24 22:14:42 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1599.pt
339
+ 2026-01-24 22:14:42 | INFO | Best 3 checkpoints:
340
+ 2026-01-24 22:14:42 | INFO | 1. Step 899: acc=0.6662 (reward_model.best_899.pt)
341
+ 2026-01-24 22:14:42 | INFO | 2. Step 1099: acc=0.6662 (reward_model.best_1099.pt)
342
+ 2026-01-24 22:14:42 | INFO | 3. Step 1499: acc=0.6662 (reward_model.best_1499.pt)
343
+ 2026-01-24 22:14:46 | INFO | Step 1600: loss=0.0346 | IF_loss=0.0421, MQ_loss=0.0272 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000072
344
+ 2026-01-24 22:16:34 | INFO |
345
+ ============================================================
346
+ Validation Results (took 7.04s):
347
+ Samples: 346 instruction, 346 quality
348
+ Instruction Acc: 0.6908
349
+ Quality Acc: 0.6445
350
+ Average Acc: 0.6676
351
+ Total Loss: 1.7117
352
+ Instruction Loss: 1.5434
353
+ Quality Loss: 1.8800
354
+ ============================================================
355
+ 2026-01-24 22:16:34 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1699.pt (filtered to 38.584M trainable parameters)
356
+ 2026-01-24 22:16:35 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1699.pt (575.2MB)
357
+ 2026-01-24 22:16:35 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1499.pt
358
+ 2026-01-24 22:16:35 | INFO | Best 3 checkpoints:
359
+ 2026-01-24 22:16:35 | INFO | 1. Step 1699: acc=0.6676 (reward_model.best_1699.pt)
360
+ 2026-01-24 22:16:35 | INFO | 2. Step 899: acc=0.6662 (reward_model.best_899.pt)
361
+ 2026-01-24 22:16:35 | INFO | 3. Step 1099: acc=0.6662 (reward_model.best_1099.pt)
362
+ 2026-01-24 22:16:36 | INFO | Step 1700: loss=0.0480 | IF_loss=0.0609, MQ_loss=0.0350 | acc=0.990 (IF=0.979, MQ=1.000) | lr=0.000069
363
+ 2026-01-24 22:18:30 | INFO |
364
+ ============================================================
365
+ Validation Results (took 7.06s):
366
+ Samples: 346 instruction, 346 quality
367
+ Instruction Acc: 0.6936
368
+ Quality Acc: 0.6445
369
+ Average Acc: 0.6691
370
+ Total Loss: 1.7110
371
+ Instruction Loss: 1.5436
372
+ Quality Loss: 1.8783
373
+ ============================================================
374
+ 2026-01-24 22:18:30 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1799.pt (filtered to 38.584M trainable parameters)
375
+ 2026-01-24 22:18:30 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1799.pt (575.2MB)
376
+ 2026-01-24 22:18:30 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1099.pt
377
+ 2026-01-24 22:18:30 | INFO | Best 3 checkpoints:
378
+ 2026-01-24 22:18:30 | INFO | 1. Step 1799: acc=0.6691 (reward_model.best_1799.pt)
379
+ 2026-01-24 22:18:30 | INFO | 2. Step 1699: acc=0.6676 (reward_model.best_1699.pt)
380
+ 2026-01-24 22:18:30 | INFO | 3. Step 899: acc=0.6662 (reward_model.best_899.pt)
381
+ 2026-01-24 22:18:32 | INFO | Step 1800: loss=0.0316 | IF_loss=0.0473, MQ_loss=0.0159 | acc=0.990 (IF=0.979, MQ=1.000) | lr=0.000065
382
+ 2026-01-24 22:20:24 | INFO |
383
+ ============================================================
384
+ Validation Results (took 7.18s):
385
+ Samples: 346 instruction, 346 quality
386
+ Instruction Acc: 0.6908
387
+ Quality Acc: 0.6474
388
+ Average Acc: 0.6691
389
+ Total Loss: 1.7090
390
+ Instruction Loss: 1.5445
391
+ Quality Loss: 1.8734
392
+ ============================================================
393
+ 2026-01-24 22:20:24 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1899.pt (filtered to 38.584M trainable parameters)
394
+ 2026-01-24 22:20:25 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1899.pt (575.2MB)
395
+ 2026-01-24 22:20:25 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_899.pt
396
+ 2026-01-24 22:20:25 | INFO | Best 3 checkpoints:
397
+ 2026-01-24 22:20:25 | INFO | 1. Step 1799: acc=0.6691 (reward_model.best_1799.pt)
398
+ 2026-01-24 22:20:25 | INFO | 2. Step 1899: acc=0.6691 (reward_model.best_1899.pt)
399
+ 2026-01-24 22:20:25 | INFO | 3. Step 1699: acc=0.6676 (reward_model.best_1699.pt)
400
+ 2026-01-24 22:20:26 | INFO | Step 1900: loss=0.0415 | IF_loss=0.0539, MQ_loss=0.0290 | acc=0.990 (IF=0.979, MQ=1.000) | lr=0.000061
401
+ 2026-01-24 22:22:22 | INFO |
402
+ ============================================================
403
+ Validation Results (took 7.27s):
404
+ Samples: 346 instruction, 346 quality
405
+ Instruction Acc: 0.6936
406
+ Quality Acc: 0.6474
407
+ Average Acc: 0.6705
408
+ Total Loss: 1.7083
409
+ Instruction Loss: 1.5455
410
+ Quality Loss: 1.8711
411
+ ============================================================
412
+ 2026-01-24 22:22:22 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1999.pt (filtered to 38.584M trainable parameters)
413
+ 2026-01-24 22:22:22 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1999.pt (575.2MB)
414
+ 2026-01-24 22:22:22 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1699.pt
415
+ 2026-01-24 22:22:22 | INFO | Best 3 checkpoints:
416
+ 2026-01-24 22:22:22 | INFO | 1. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
417
+ 2026-01-24 22:22:22 | INFO | 2. Step 1799: acc=0.6691 (reward_model.best_1799.pt)
418
+ 2026-01-24 22:22:22 | INFO | 3. Step 1899: acc=0.6691 (reward_model.best_1899.pt)
419
+ 2026-01-24 22:22:23 | INFO | Step 2000: loss=0.0589 | IF_loss=0.0511, MQ_loss=0.0667 | acc=0.979 (IF=0.979, MQ=0.979) | lr=0.000056
420
+ 2026-01-24 22:22:23 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.2000.pt (filtered to 38.584M trainable parameters)
421
+ 2026-01-24 22:22:24 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.2000.pt (575.2MB)
422
+ 2026-01-24 22:22:24 | INFO | Step 2000: Saved to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.2000.pt
423
+ 2026-01-24 22:24:18 | INFO |
424
+ ============================================================
425
+ Validation Results (took 7.25s):
426
+ Samples: 346 instruction, 346 quality
427
+ Instruction Acc: 0.6879
428
+ Quality Acc: 0.6474
429
+ Average Acc: 0.6676
430
+ Total Loss: 1.7086
431
+ Instruction Loss: 1.5472
432
+ Quality Loss: 1.8700
433
+ ============================================================
434
+ 2026-01-24 22:24:18 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2099.pt (filtered to 38.584M trainable parameters)
435
+ 2026-01-24 22:24:19 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2099.pt (575.2MB)
436
+ 2026-01-24 22:24:19 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2099.pt
437
+ 2026-01-24 22:24:19 | INFO | Best 3 checkpoints:
438
+ 2026-01-24 22:24:19 | INFO | 1. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
439
+ 2026-01-24 22:24:19 | INFO | 2. Step 1799: acc=0.6691 (reward_model.best_1799.pt)
440
+ 2026-01-24 22:24:19 | INFO | 3. Step 1899: acc=0.6691 (reward_model.best_1899.pt)
441
+ 2026-01-24 22:24:20 | INFO | Step 2100: loss=0.0284 | IF_loss=0.0286, MQ_loss=0.0281 | acc=0.990 (IF=1.000, MQ=0.979) | lr=0.000052
442
+ 2026-01-24 22:26:12 | INFO |
443
+ ============================================================
444
+ Validation Results (took 7.00s):
445
+ Samples: 346 instruction, 346 quality
446
+ Instruction Acc: 0.6879
447
+ Quality Acc: 0.6503
448
+ Average Acc: 0.6691
449
+ Total Loss: 1.7083
450
+ Instruction Loss: 1.5495
451
+ Quality Loss: 1.8672
452
+ ============================================================
453
+ 2026-01-24 22:26:12 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2199.pt (filtered to 38.584M trainable parameters)
454
+ 2026-01-24 22:26:13 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2199.pt (575.2MB)
455
+ 2026-01-24 22:26:13 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1899.pt
456
+ 2026-01-24 22:26:13 | INFO | Best 3 checkpoints:
457
+ 2026-01-24 22:26:13 | INFO | 1. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
458
+ 2026-01-24 22:26:13 | INFO | 2. Step 2199: acc=0.6691 (reward_model.best_2199.pt)
459
+ 2026-01-24 22:26:13 | INFO | 3. Step 1799: acc=0.6691 (reward_model.best_1799.pt)
460
+ 2026-01-24 22:26:14 | INFO | Step 2200: loss=0.0061 | IF_loss=0.0038, MQ_loss=0.0085 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000048
461
+ 2026-01-24 22:28:05 | INFO |
462
+ ============================================================
463
+ Validation Results (took 7.64s):
464
+ Samples: 346 instruction, 346 quality
465
+ Instruction Acc: 0.6879
466
+ Quality Acc: 0.6532
467
+ Average Acc: 0.6705
468
+ Total Loss: 1.7088
469
+ Instruction Loss: 1.5525
470
+ Quality Loss: 1.8651
471
+ ============================================================
472
+ 2026-01-24 22:28:05 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2299.pt (filtered to 38.584M trainable parameters)
473
+ 2026-01-24 22:28:05 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2299.pt (575.2MB)
474
+ 2026-01-24 22:28:05 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1799.pt
475
+ 2026-01-24 22:28:05 | INFO | Best 3 checkpoints:
476
+ 2026-01-24 22:28:05 | INFO | 1. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
477
+ 2026-01-24 22:28:05 | INFO | 2. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
478
+ 2026-01-24 22:28:05 | INFO | 3. Step 2199: acc=0.6691 (reward_model.best_2199.pt)
479
+ 2026-01-24 22:28:06 | INFO | Step 2300: loss=0.0451 | IF_loss=0.0768, MQ_loss=0.0134 | acc=0.990 (IF=0.979, MQ=1.000) | lr=0.000044
480
+ 2026-01-24 22:30:00 | INFO |
481
+ ============================================================
482
+ Validation Results (took 7.30s):
483
+ Samples: 346 instruction, 346 quality
484
+ Instruction Acc: 0.6908
485
+ Quality Acc: 0.6532
486
+ Average Acc: 0.6720
487
+ Total Loss: 1.7079
488
+ Instruction Loss: 1.5530
489
+ Quality Loss: 1.8628
490
+ ============================================================
491
+ 2026-01-24 22:30:00 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2399.pt (filtered to 38.584M trainable parameters)
492
+ 2026-01-24 22:30:01 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2399.pt (575.2MB)
493
+ 2026-01-24 22:30:01 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2199.pt
494
+ 2026-01-24 22:30:01 | INFO | Best 3 checkpoints:
495
+ 2026-01-24 22:30:01 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
496
+ 2026-01-24 22:30:01 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
497
+ 2026-01-24 22:30:01 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
498
+ 2026-01-24 22:30:02 | INFO | Step 2400: loss=0.0141 | IF_loss=0.0160, MQ_loss=0.0122 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000039
499
+ 2026-01-24 22:31:52 | INFO |
500
+ ============================================================
501
+ Validation Results (took 6.60s):
502
+ Samples: 346 instruction, 346 quality
503
+ Instruction Acc: 0.6879
504
+ Quality Acc: 0.6503
505
+ Average Acc: 0.6691
506
+ Total Loss: 1.7095
507
+ Instruction Loss: 1.5571
508
+ Quality Loss: 1.8619
509
+ ============================================================
510
+ 2026-01-24 22:31:53 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2499.pt (filtered to 38.584M trainable parameters)
511
+ 2026-01-24 22:31:53 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2499.pt (575.2MB)
512
+ 2026-01-24 22:31:53 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2499.pt
513
+ 2026-01-24 22:31:53 | INFO | Best 3 checkpoints:
514
+ 2026-01-24 22:31:53 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
515
+ 2026-01-24 22:31:53 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
516
+ 2026-01-24 22:31:53 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
517
+ 2026-01-24 22:31:54 | INFO | Step 2500: loss=0.0073 | IF_loss=0.0109, MQ_loss=0.0036 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000035
518
+ 2026-01-24 22:33:43 | INFO |
519
+ ============================================================
520
+ Validation Results (took 6.93s):
521
+ Samples: 346 instruction, 346 quality
522
+ Instruction Acc: 0.6879
523
+ Quality Acc: 0.6503
524
+ Average Acc: 0.6691
525
+ Total Loss: 1.7093
526
+ Instruction Loss: 1.5586
527
+ Quality Loss: 1.8601
528
+ ============================================================
529
+ 2026-01-24 22:33:43 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2599.pt (filtered to 38.584M trainable parameters)
530
+ 2026-01-24 22:33:43 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2599.pt (575.2MB)
531
+ 2026-01-24 22:33:43 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2599.pt
532
+ 2026-01-24 22:33:43 | INFO | Best 3 checkpoints:
533
+ 2026-01-24 22:33:43 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
534
+ 2026-01-24 22:33:43 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
535
+ 2026-01-24 22:33:43 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
536
+ 2026-01-24 22:33:44 | INFO | Step 2600: loss=0.0025 | IF_loss=0.0039, MQ_loss=0.0011 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000031
537
+ 2026-01-24 22:35:39 | INFO |
538
+ ============================================================
539
+ Validation Results (took 6.71s):
540
+ Samples: 346 instruction, 346 quality
541
+ Instruction Acc: 0.6879
542
+ Quality Acc: 0.6503
543
+ Average Acc: 0.6691
544
+ Total Loss: 1.7105
545
+ Instruction Loss: 1.5632
546
+ Quality Loss: 1.8577
547
+ ============================================================
548
+ 2026-01-24 22:35:39 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2699.pt (filtered to 38.584M trainable parameters)
549
+ 2026-01-24 22:35:39 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2699.pt (575.2MB)
550
+ 2026-01-24 22:35:39 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2699.pt
551
+ 2026-01-24 22:35:39 | INFO | Best 3 checkpoints:
552
+ 2026-01-24 22:35:39 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
553
+ 2026-01-24 22:35:39 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
554
+ 2026-01-24 22:35:39 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
555
+ 2026-01-24 22:35:40 | INFO | Step 2700: loss=0.0285 | IF_loss=0.0436, MQ_loss=0.0134 | acc=0.990 (IF=0.979, MQ=1.000) | lr=0.000028
556
+ 2026-01-24 22:37:31 | INFO |
557
+ ============================================================
558
+ Validation Results (took 7.15s):
559
+ Samples: 346 instruction, 346 quality
560
+ Instruction Acc: 0.6850
561
+ Quality Acc: 0.6503
562
+ Average Acc: 0.6676
563
+ Total Loss: 1.7119
564
+ Instruction Loss: 1.5662
565
+ Quality Loss: 1.8576
566
+ ============================================================
567
+ 2026-01-24 22:37:31 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2799.pt (filtered to 38.584M trainable parameters)
568
+ 2026-01-24 22:37:32 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2799.pt (575.2MB)
569
+ 2026-01-24 22:37:32 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2799.pt
570
+ 2026-01-24 22:37:32 | INFO | Best 3 checkpoints:
571
+ 2026-01-24 22:37:32 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
572
+ 2026-01-24 22:37:32 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
573
+ 2026-01-24 22:37:32 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
574
+ 2026-01-24 22:37:33 | INFO | Step 2800: loss=0.0054 | IF_loss=0.0086, MQ_loss=0.0023 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000024
575
+ 2026-01-24 22:39:25 | INFO |
576
+ ============================================================
577
+ Validation Results (took 6.52s):
578
+ Samples: 346 instruction, 346 quality
579
+ Instruction Acc: 0.6879
580
+ Quality Acc: 0.6503
581
+ Average Acc: 0.6691
582
+ Total Loss: 1.7105
583
+ Instruction Loss: 1.5670
584
+ Quality Loss: 1.8540
585
+ ============================================================
586
+ 2026-01-24 22:39:25 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2899.pt (filtered to 38.584M trainable parameters)
587
+ 2026-01-24 22:39:26 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2899.pt (575.2MB)
588
+ 2026-01-24 22:39:26 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2899.pt
589
+ 2026-01-24 22:39:26 | INFO | Best 3 checkpoints:
590
+ 2026-01-24 22:39:26 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
591
+ 2026-01-24 22:39:26 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
592
+ 2026-01-24 22:39:26 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
593
+ 2026-01-24 22:39:27 | INFO | Step 2900: loss=0.0121 | IF_loss=0.0158, MQ_loss=0.0084 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000020
594
+ 2026-01-24 22:41:17 | INFO |
595
+ ============================================================
596
+ Validation Results (took 7.44s):
597
+ Samples: 346 instruction, 346 quality
598
+ Instruction Acc: 0.6879
599
+ Quality Acc: 0.6503
600
+ Average Acc: 0.6691
601
+ Total Loss: 1.7130
602
+ Instruction Loss: 1.5717
603
+ Quality Loss: 1.8543
604
+ ============================================================
605
+ 2026-01-24 22:41:17 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2999.pt (filtered to 38.584M trainable parameters)
606
+ 2026-01-24 22:41:17 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2999.pt (575.2MB)
607
+ 2026-01-24 22:41:18 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2999.pt
608
+ 2026-01-24 22:41:18 | INFO | Best 3 checkpoints:
609
+ 2026-01-24 22:41:18 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
610
+ 2026-01-24 22:41:18 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
611
+ 2026-01-24 22:41:18 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
612
+ 2026-01-24 22:41:19 | INFO | Step 3000: loss=0.0040 | IF_loss=0.0024, MQ_loss=0.0055 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000017
613
+ 2026-01-24 22:43:12 | INFO |
614
+ ============================================================
615
+ Validation Results (took 6.84s):
616
+ Samples: 346 instruction, 346 quality
617
+ Instruction Acc: 0.6908
618
+ Quality Acc: 0.6503
619
+ Average Acc: 0.6705
620
+ Total Loss: 1.7137
621
+ Instruction Loss: 1.5743
622
+ Quality Loss: 1.8532
623
+ ============================================================
624
+ 2026-01-24 22:43:12 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3099.pt (filtered to 38.584M trainable parameters)
625
+ 2026-01-24 22:43:12 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3099.pt (575.2MB)
626
+ 2026-01-24 22:43:12 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3099.pt
627
+ 2026-01-24 22:43:12 | INFO | Best 3 checkpoints:
628
+ 2026-01-24 22:43:12 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
629
+ 2026-01-24 22:43:12 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
630
+ 2026-01-24 22:43:12 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
631
+ 2026-01-24 22:43:13 | INFO | Step 3100: loss=0.0095 | IF_loss=0.0161, MQ_loss=0.0029 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000014
632
+ 2026-01-24 22:45:04 | INFO |
633
+ ============================================================
634
+ Validation Results (took 7.66s):
635
+ Samples: 346 instruction, 346 quality
636
+ Instruction Acc: 0.6879
637
+ Quality Acc: 0.6503
638
+ Average Acc: 0.6691
639
+ Total Loss: 1.7135
640
+ Instruction Loss: 1.5760
641
+ Quality Loss: 1.8510
642
+ ============================================================
643
+ 2026-01-24 22:45:04 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3199.pt (filtered to 38.584M trainable parameters)
644
+ 2026-01-24 22:45:04 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3199.pt (575.2MB)
645
+ 2026-01-24 22:45:04 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3199.pt
646
+ 2026-01-24 22:45:04 | INFO | Best 3 checkpoints:
647
+ 2026-01-24 22:45:04 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
648
+ 2026-01-24 22:45:04 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
649
+ 2026-01-24 22:45:04 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
650
+ 2026-01-24 22:45:08 | INFO | Step 3200: loss=0.0050 | IF_loss=0.0072, MQ_loss=0.0027 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000011
651
+ 2026-01-24 22:46:57 | INFO |
652
+ ============================================================
653
+ Validation Results (took 6.84s):
654
+ Samples: 346 instruction, 346 quality
655
+ Instruction Acc: 0.6879
656
+ Quality Acc: 0.6503
657
+ Average Acc: 0.6691
658
+ Total Loss: 1.7154
659
+ Instruction Loss: 1.5809
660
+ Quality Loss: 1.8499
661
+ ============================================================
662
+ 2026-01-24 22:46:57 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3299.pt (filtered to 38.584M trainable parameters)
663
+ 2026-01-24 22:46:57 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3299.pt (575.2MB)
664
+ 2026-01-24 22:46:57 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3299.pt
665
+ 2026-01-24 22:46:57 | INFO | Best 3 checkpoints:
666
+ 2026-01-24 22:46:57 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
667
+ 2026-01-24 22:46:57 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
668
+ 2026-01-24 22:46:57 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
669
+ 2026-01-24 22:46:58 | INFO | Step 3300: loss=0.0362 | IF_loss=0.0503, MQ_loss=0.0221 | acc=0.990 (IF=0.979, MQ=1.000) | lr=0.000009
670
+ 2026-01-24 22:48:50 | INFO |
671
+ ============================================================
672
+ Validation Results (took 6.83s):
673
+ Samples: 346 instruction, 346 quality
674
+ Instruction Acc: 0.6879
675
+ Quality Acc: 0.6532
676
+ Average Acc: 0.6705
677
+ Total Loss: 1.7154
678
+ Instruction Loss: 1.5832
679
+ Quality Loss: 1.8477
680
+ ============================================================
681
+ 2026-01-24 22:48:50 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3399.pt (filtered to 38.584M trainable parameters)
682
+ 2026-01-24 22:48:51 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3399.pt (575.2MB)
683
+ 2026-01-24 22:48:51 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3399.pt
684
+ 2026-01-24 22:48:51 | INFO | Best 3 checkpoints:
685
+ 2026-01-24 22:48:51 | INFO | 1. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
686
+ 2026-01-24 22:48:51 | INFO | 2. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
687
+ 2026-01-24 22:48:51 | INFO | 3. Step 2299: acc=0.6705 (reward_model.best_2299.pt)
688
+ 2026-01-24 22:48:52 | INFO | Step 3400: loss=0.0082 | IF_loss=0.0113, MQ_loss=0.0051 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000006
689
+ 2026-01-24 22:50:41 | INFO |
690
+ ============================================================
691
+ Validation Results (took 7.19s):
692
+ Samples: 346 instruction, 346 quality
693
+ Instruction Acc: 0.6908
694
+ Quality Acc: 0.6590
695
+ Average Acc: 0.6749
696
+ Total Loss: 1.7151
697
+ Instruction Loss: 1.5847
698
+ Quality Loss: 1.8456
699
+ ============================================================
700
+ 2026-01-24 22:50:41 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3499.pt (filtered to 38.584M trainable parameters)
701
+ 2026-01-24 22:50:41 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3499.pt (575.2MB)
702
+ 2026-01-24 22:50:41 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2299.pt
703
+ 2026-01-24 22:50:41 | INFO | Best 3 checkpoints:
704
+ 2026-01-24 22:50:41 | INFO | 1. Step 3499: acc=0.6749 (reward_model.best_3499.pt)
705
+ 2026-01-24 22:50:41 | INFO | 2. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
706
+ 2026-01-24 22:50:41 | INFO | 3. Step 1999: acc=0.6705 (reward_model.best_1999.pt)
707
+ 2026-01-24 22:50:42 | INFO | Step 3500: loss=0.0045 | IF_loss=0.0077, MQ_loss=0.0013 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000005
708
+ 2026-01-24 22:52:33 | INFO |
709
+ ============================================================
710
+ Validation Results (took 7.08s):
711
+ Samples: 346 instruction, 346 quality
712
+ Instruction Acc: 0.6879
713
+ Quality Acc: 0.6590
714
+ Average Acc: 0.6734
715
+ Total Loss: 1.7160
716
+ Instruction Loss: 1.5876
717
+ Quality Loss: 1.8445
718
+ ============================================================
719
+ 2026-01-24 22:52:33 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3599.pt (filtered to 38.584M trainable parameters)
720
+ 2026-01-24 22:52:34 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3599.pt (575.2MB)
721
+ 2026-01-24 22:52:34 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_1999.pt
722
+ 2026-01-24 22:52:34 | INFO | Best 3 checkpoints:
723
+ 2026-01-24 22:52:34 | INFO | 1. Step 3499: acc=0.6749 (reward_model.best_3499.pt)
724
+ 2026-01-24 22:52:34 | INFO | 2. Step 3599: acc=0.6734 (reward_model.best_3599.pt)
725
+ 2026-01-24 22:52:34 | INFO | 3. Step 2399: acc=0.6720 (reward_model.best_2399.pt)
726
+ 2026-01-24 22:52:35 | INFO | Step 3600: loss=0.0126 | IF_loss=0.0220, MQ_loss=0.0031 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000003
727
+ 2026-01-24 22:54:24 | INFO |
728
+ ============================================================
729
+ Validation Results (took 7.20s):
730
+ Samples: 346 instruction, 346 quality
731
+ Instruction Acc: 0.6879
732
+ Quality Acc: 0.6590
733
+ Average Acc: 0.6734
734
+ Total Loss: 1.7161
735
+ Instruction Loss: 1.5894
736
+ Quality Loss: 1.8428
737
+ ============================================================
738
+ 2026-01-24 22:54:24 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3699.pt (filtered to 38.584M trainable parameters)
739
+ 2026-01-24 22:54:24 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3699.pt (575.2MB)
740
+ 2026-01-24 22:54:24 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_2399.pt
741
+ 2026-01-24 22:54:24 | INFO | Best 3 checkpoints:
742
+ 2026-01-24 22:54:24 | INFO | 1. Step 3499: acc=0.6749 (reward_model.best_3499.pt)
743
+ 2026-01-24 22:54:24 | INFO | 2. Step 3599: acc=0.6734 (reward_model.best_3599.pt)
744
+ 2026-01-24 22:54:24 | INFO | 3. Step 3699: acc=0.6734 (reward_model.best_3699.pt)
745
+ 2026-01-24 22:54:25 | INFO | Step 3700: loss=0.0085 | IF_loss=0.0041, MQ_loss=0.0130 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000002
746
+ 2026-01-24 22:56:18 | INFO |
747
+ ============================================================
748
+ Validation Results (took 6.85s):
749
+ Samples: 346 instruction, 346 quality
750
+ Instruction Acc: 0.6879
751
+ Quality Acc: 0.6618
752
+ Average Acc: 0.6749
753
+ Total Loss: 1.7157
754
+ Instruction Loss: 1.5912
755
+ Quality Loss: 1.8403
756
+ ============================================================
757
+ 2026-01-24 22:56:18 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3799.pt (filtered to 38.584M trainable parameters)
758
+ 2026-01-24 22:56:19 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3799.pt (575.2MB)
759
+ 2026-01-24 22:56:19 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3699.pt
760
+ 2026-01-24 22:56:19 | INFO | Best 3 checkpoints:
761
+ 2026-01-24 22:56:19 | INFO | 1. Step 3499: acc=0.6749 (reward_model.best_3499.pt)
762
+ 2026-01-24 22:56:19 | INFO | 2. Step 3799: acc=0.6749 (reward_model.best_3799.pt)
763
+ 2026-01-24 22:56:19 | INFO | 3. Step 3599: acc=0.6734 (reward_model.best_3599.pt)
764
+ 2026-01-24 22:56:20 | INFO | Step 3800: loss=0.0120 | IF_loss=0.0037, MQ_loss=0.0202 | acc=0.990 (IF=1.000, MQ=0.979) | lr=0.000001
765
+ 2026-01-24 22:58:09 | INFO |
766
+ ============================================================
767
+ Validation Results (took 7.39s):
768
+ Samples: 346 instruction, 346 quality
769
+ Instruction Acc: 0.6908
770
+ Quality Acc: 0.6590
771
+ Average Acc: 0.6749
772
+ Total Loss: 1.7163
773
+ Instruction Loss: 1.5935
774
+ Quality Loss: 1.8391
775
+ ============================================================
776
+ 2026-01-24 22:58:09 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3899.pt (filtered to 38.584M trainable parameters)
777
+ 2026-01-24 22:58:10 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3899.pt (575.2MB)
778
+ 2026-01-24 22:58:10 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3599.pt
779
+ 2026-01-24 22:58:10 | INFO | Best 3 checkpoints:
780
+ 2026-01-24 22:58:10 | INFO | 1. Step 3499: acc=0.6749 (reward_model.best_3499.pt)
781
+ 2026-01-24 22:58:10 | INFO | 2. Step 3799: acc=0.6749 (reward_model.best_3799.pt)
782
+ 2026-01-24 22:58:10 | INFO | 3. Step 3899: acc=0.6749 (reward_model.best_3899.pt)
783
+ 2026-01-24 22:58:11 | INFO | Step 3900: loss=0.0060 | IF_loss=0.0040, MQ_loss=0.0080 | acc=1.000 (IF=1.000, MQ=1.000) | lr=0.000000
784
+ 2026-01-24 23:00:02 | INFO |
785
+ ============================================================
786
+ Validation Results (took 6.60s):
787
+ Samples: 346 instruction, 346 quality
788
+ Instruction Acc: 0.6908
789
+ Quality Acc: 0.6590
790
+ Average Acc: 0.6749
791
+ Total Loss: 1.7165
792
+ Instruction Loss: 1.5967
793
+ Quality Loss: 1.8363
794
+ ============================================================
795
+ 2026-01-24 23:00:02 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3999.pt (filtered to 38.584M trainable parameters)
796
+ 2026-01-24 23:00:02 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3999.pt (575.2MB)
797
+ 2026-01-24 23:00:02 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2143/ckpt/reward_model.best_3999.pt
798
+ 2026-01-24 23:00:02 | INFO | Best 3 checkpoints:
799
+ 2026-01-24 23:00:02 | INFO | 1. Step 3499: acc=0.6749 (reward_model.best_3499.pt)
800
+ 2026-01-24 23:00:02 | INFO | 2. Step 3799: acc=0.6749 (reward_model.best_3799.pt)
801
+ 2026-01-24 23:00:02 | INFO | 3. Step 3899: acc=0.6749 (reward_model.best_3899.pt)
802
+ 2026-01-24 23:00:02 | INFO | Training complete!
803
+ 2026-01-24 23:00:02 | INFO | Training complete!
20260124_2354/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '3'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: false
10
+ apply_to_ref: true
11
+ enabled: true
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 200
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ max_samples: null
21
+ max_val_samples: null
22
+ metadata_jsonl: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/all_comparisons.jsonl
23
+ mode: raw_text_frozen_audio
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: null
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ no_condition: false
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_audio: true
107
+ use_layer_idx: -1
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: null
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 4000
125
+ warmup_steps: 10
126
+ max_grad_norm: 1
127
+ mlp_lr: 1.0e-05
128
+ num_train_steps: 4000
129
+ num_valid_batches: null
130
+ num_workers: 8
131
+ other_lr: 1.0e-05
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: false
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 100
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: false
20260124_2354/reward_model/1769270104.0081618/events.out.tfevents.1769270104.MACLAB-S004.3211506.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1c9d6a9609c7a84c7ece1c70819976b71fcbc41491a14dc042f92c982873761
3
+ size 503
20260124_2354/reward_model/1769270104.0091846/hparams.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ batch_size: 48
2
+ grad_accum_every: 1
3
+ learning_rate: 1.0e-05
4
+ num_train_steps: 4000
20260124_2354/reward_model/events.out.tfevents.1769270104.MACLAB-S004.3211506.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:202656c2b035463a9a8b723b88d4b0c93b2d50b91b2b2c0ebb10f09261494610
3
+ size 647887
20260124_2354/train.20260124_2354.log ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-24 23:54:55 | INFO | Log file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/train.20260124_2354.log
2
+ 2026-01-24 23:54:55 | INFO | Random seed set to 42
3
+ 2026-01-24 23:54:56 | INFO | Created RawTextFrozenAudioDataset with 3463 samples
4
+ 2026-01-24 23:54:56 | INFO | Split dataset into train (3117) and validation (346) sets (ratio: 10.00%)
5
+ 2026-01-24 23:54:56 | INFO | Will resume from checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
6
+ 2026-01-24 23:54:56 | INFO | Using checkpoint config for model initialization (continue training mode)
7
+ 2026-01-24 23:55:02 | INFO | Created RewardAttentionModel with attention_mode=SA
8
+ 2026-01-24 23:55:02 | INFO | Created PreferenceLoss with filter_ties=True
9
+ 2026-01-24 23:55:03 | INFO | ✓ Gradient checkpointing enabled
10
+ 2026-01-24 23:55:03 | INFO | ✓ Audio cropping enabled: min=200, max=1500
11
+ 2026-01-24 23:55:03 | INFO | Apply to eval: False, ref: True
12
+ 2026-01-24 23:55:03 | INFO | Modes: train=random, val=start
13
+ 2026-01-24 23:55:03 | INFO | MLP head parameters: 1,186,563 params, lr=1e-05
14
+ 2026-01-24 23:55:03 | INFO | Other parameters: 37,397,634 params, lr=1e-05
15
+ 2026-01-24 23:55:03 | INFO | Using lr_schedule=linear_cosine warmup_steps=10 total_steps=4000
16
+ 2026-01-24 23:55:03 | INFO | Training with fixed validation set
17
+ 2026-01-24 23:55:03 | INFO | Train batch_size: 48, Valid batch_size: 20
18
+ 2026-01-24 23:55:03 | INFO | Missing keys (782): ['text_module.model.embeddings.word_embeddings.weight', 'text_module.model.embeddings.position_embeddings.weight', 'text_module.model.embeddings.token_type_embeddings.weight', 'text_module.model.embeddings.LayerNorm.weight', 'text_module.model.embeddings.LayerNorm.bias']...
19
+ 2026-01-24 23:55:03 | INFO | ✓ Starting from step 0 (transfer learning mode, ignoring checkpoint steps=29999)
20
+ 2026-01-24 23:55:03 | INFO | Resumed from /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
21
+ 2026-01-24 23:55:03 | INFO | Parameters: 701.162M total, 38.584M trainable
22
+ 2026-01-24 23:55:03 | INFO | Text encoder (frozen): 328.389M
23
+ 2026-01-24 23:55:03 | INFO | Audio encoder (frozen): 334.189M
24
+ 2026-01-24 23:55:03 | INFO | Other trainable: 38.584M
25
+ 2026-01-24 23:55:03 | INFO | ℹ No LoRA configuration detected
26
+ 2026-01-24 23:55:04 | INFO | ============================================================
27
+ 2026-01-24 23:55:04 | INFO | Ready to start training
28
+ 2026-01-24 23:55:04 | INFO | ============================================================
29
+ 2026-01-24 23:55:04 | INFO | Starting training from step 0
30
+ 2026-01-24 23:55:04 | INFO | ===== Accelerator / CUDA Debug Info =====
31
+ 2026-01-24 23:55:04 | INFO | accelerator.device = cuda
32
+ 2026-01-24 23:55:04 | INFO | mixed_precision = bf16
33
+ 2026-01-24 23:55:04 | INFO | distributed_type = NO
34
+ 2026-01-24 23:55:04 | INFO | num_processes = 1
35
+ 2026-01-24 23:55:04 | INFO | process_index = 0
36
+ 2026-01-24 23:55:04 | INFO | is_main_process = True
37
+ 2026-01-24 23:55:04 | INFO | torch.cuda.is_available() = True
38
+ 2026-01-24 23:55:04 | INFO | torch.cuda.device_count() = 1
39
+ 2026-01-24 23:55:04 | INFO | current_device = 0
40
+ 2026-01-24 23:55:04 | INFO | device_name = NVIDIA GeForce RTX 4090
41
+ 2026-01-24 23:55:04 | INFO | model parameter device = cuda:0
42
+ 2026-01-24 23:55:04 | INFO | Training for 4000.0 steps (~63 epochs, 64 steps/epoch)
43
+ 2026-01-24 23:55:12 | INFO | Step 0: loss=1.8546 | IF_loss=2.4068, MQ_loss=1.3024 | acc=0.760 (IF=0.708, MQ=0.812) | lr=0.000002
44
+ 2026-01-24 23:55:12 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.0.pt (filtered to 38.584M trainable parameters)
45
+ 2026-01-24 23:55:12 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.0.pt (428.0MB)
46
+ 2026-01-24 23:55:12 | INFO | Step 0: Saved to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.0.pt
47
+ 2026-01-24 23:56:57 | INFO |
48
+ ============================================================
49
+ Validation Results (took 8.49s):
50
+ Samples: 346 instruction, 346 quality
51
+ Instruction Acc: 0.7052
52
+ Quality Acc: 0.6821
53
+ Average Acc: 0.6936
54
+ Total Loss: 1.2481
55
+ Instruction Loss: 1.1851
56
+ Quality Loss: 1.3111
57
+ ============================================================
58
+ 2026-01-24 23:56:57 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_99.pt (filtered to 38.584M trainable parameters)
59
+ 2026-01-24 23:56:57 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_99.pt (428.0MB)
60
+ 2026-01-24 23:56:57 | INFO | Best 1 checkpoints:
61
+ 2026-01-24 23:56:57 | INFO | 1. Step 99: acc=0.6936 (reward_model.best_99.pt)
62
+ 2026-01-24 23:56:58 | INFO | Step 100: loss=1.0138 | IF_loss=0.8556, MQ_loss=1.1720 | acc=0.708 (IF=0.688, MQ=0.729) | lr=0.000010
63
+ 2026-01-24 23:58:43 | INFO |
64
+ ============================================================
65
+ Validation Results (took 6.77s):
66
+ Samples: 346 instruction, 346 quality
67
+ Instruction Acc: 0.6965
68
+ Quality Acc: 0.7197
69
+ Average Acc: 0.7081
70
+ Total Loss: 0.7433
71
+ Instruction Loss: 0.7416
72
+ Quality Loss: 0.7450
73
+ ============================================================
74
+ 2026-01-24 23:58:44 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_199.pt (filtered to 38.584M trainable parameters)
75
+ 2026-01-24 23:58:44 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_199.pt (428.0MB)
76
+ 2026-01-24 23:58:44 | INFO | Best 2 checkpoints:
77
+ 2026-01-24 23:58:44 | INFO | 1. Step 199: acc=0.7081 (reward_model.best_199.pt)
78
+ 2026-01-24 23:58:44 | INFO | 2. Step 99: acc=0.6936 (reward_model.best_99.pt)
79
+ 2026-01-24 23:58:45 | INFO | Step 200: loss=0.4285 | IF_loss=0.4361, MQ_loss=0.4208 | acc=0.812 (IF=0.792, MQ=0.833) | lr=0.000010
80
+ 2026-01-25 00:00:32 | INFO |
81
+ ============================================================
82
+ Validation Results (took 7.53s):
83
+ Samples: 346 instruction, 346 quality
84
+ Instruction Acc: 0.7052
85
+ Quality Acc: 0.7514
86
+ Average Acc: 0.7283
87
+ Total Loss: 0.6484
88
+ Instruction Loss: 0.6697
89
+ Quality Loss: 0.6271
90
+ ============================================================
91
+ 2026-01-25 00:00:32 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_299.pt (filtered to 38.584M trainable parameters)
92
+ 2026-01-25 00:00:32 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_299.pt (428.0MB)
93
+ 2026-01-25 00:00:32 | INFO | Best 3 checkpoints:
94
+ 2026-01-25 00:00:32 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
95
+ 2026-01-25 00:00:32 | INFO | 2. Step 199: acc=0.7081 (reward_model.best_199.pt)
96
+ 2026-01-25 00:00:32 | INFO | 3. Step 99: acc=0.6936 (reward_model.best_99.pt)
97
+ 2026-01-25 00:00:33 | INFO | Step 300: loss=0.3843 | IF_loss=0.4473, MQ_loss=0.3212 | acc=0.844 (IF=0.792, MQ=0.896) | lr=0.000010
98
+ 2026-01-25 00:02:21 | INFO |
99
+ ============================================================
100
+ Validation Results (took 7.10s):
101
+ Samples: 346 instruction, 346 quality
102
+ Instruction Acc: 0.6994
103
+ Quality Acc: 0.7399
104
+ Average Acc: 0.7197
105
+ Total Loss: 0.6475
106
+ Instruction Loss: 0.6784
107
+ Quality Loss: 0.6167
108
+ ============================================================
109
+ 2026-01-25 00:02:21 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_399.pt (filtered to 38.584M trainable parameters)
110
+ 2026-01-25 00:02:22 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_399.pt (428.0MB)
111
+ 2026-01-25 00:02:22 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_99.pt
112
+ 2026-01-25 00:02:22 | INFO | Best 3 checkpoints:
113
+ 2026-01-25 00:02:22 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
114
+ 2026-01-25 00:02:22 | INFO | 2. Step 399: acc=0.7197 (reward_model.best_399.pt)
115
+ 2026-01-25 00:02:22 | INFO | 3. Step 199: acc=0.7081 (reward_model.best_199.pt)
116
+ 2026-01-25 00:02:23 | INFO | Step 400: loss=0.5100 | IF_loss=0.5393, MQ_loss=0.4806 | acc=0.771 (IF=0.729, MQ=0.812) | lr=0.000010
117
+ 2026-01-25 00:04:09 | INFO |
118
+ ============================================================
119
+ Validation Results (took 7.22s):
120
+ Samples: 346 instruction, 346 quality
121
+ Instruction Acc: 0.6965
122
+ Quality Acc: 0.7399
123
+ Average Acc: 0.7182
124
+ Total Loss: 0.6691
125
+ Instruction Loss: 0.7099
126
+ Quality Loss: 0.6283
127
+ ============================================================
128
+ 2026-01-25 00:04:09 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_499.pt (filtered to 38.584M trainable parameters)
129
+ 2026-01-25 00:04:09 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_499.pt (428.0MB)
130
+ 2026-01-25 00:04:09 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_199.pt
131
+ 2026-01-25 00:04:09 | INFO | Best 3 checkpoints:
132
+ 2026-01-25 00:04:09 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
133
+ 2026-01-25 00:04:09 | INFO | 2. Step 399: acc=0.7197 (reward_model.best_399.pt)
134
+ 2026-01-25 00:04:09 | INFO | 3. Step 499: acc=0.7182 (reward_model.best_499.pt)
135
+ 2026-01-25 00:04:10 | INFO | Step 500: loss=0.4517 | IF_loss=0.5286, MQ_loss=0.3749 | acc=0.771 (IF=0.750, MQ=0.792) | lr=0.000010
136
+ 2026-01-25 00:06:01 | INFO |
137
+ ============================================================
138
+ Validation Results (took 7.07s):
139
+ Samples: 346 instruction, 346 quality
140
+ Instruction Acc: 0.6994
141
+ Quality Acc: 0.7457
142
+ Average Acc: 0.7225
143
+ Total Loss: 0.6932
144
+ Instruction Loss: 0.7406
145
+ Quality Loss: 0.6458
146
+ ============================================================
147
+ 2026-01-25 00:06:02 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_599.pt (filtered to 38.584M trainable parameters)
148
+ 2026-01-25 00:06:02 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_599.pt (428.0MB)
149
+ 2026-01-25 00:06:02 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_499.pt
150
+ 2026-01-25 00:06:02 | INFO | Best 3 checkpoints:
151
+ 2026-01-25 00:06:02 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
152
+ 2026-01-25 00:06:02 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
153
+ 2026-01-25 00:06:02 | INFO | 3. Step 399: acc=0.7197 (reward_model.best_399.pt)
154
+ 2026-01-25 00:06:03 | INFO | Step 600: loss=0.3738 | IF_loss=0.4063, MQ_loss=0.3414 | acc=0.854 (IF=0.854, MQ=0.854) | lr=0.000009
155
+ 2026-01-25 00:07:51 | INFO |
156
+ ============================================================
157
+ Validation Results (took 8.06s):
158
+ Samples: 346 instruction, 346 quality
159
+ Instruction Acc: 0.6965
160
+ Quality Acc: 0.7370
161
+ Average Acc: 0.7168
162
+ Total Loss: 0.7321
163
+ Instruction Loss: 0.7873
164
+ Quality Loss: 0.6769
165
+ ============================================================
166
+ 2026-01-25 00:07:51 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_699.pt (filtered to 38.584M trainable parameters)
167
+ 2026-01-25 00:07:51 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_699.pt (428.0MB)
168
+ 2026-01-25 00:07:51 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_699.pt
169
+ 2026-01-25 00:07:51 | INFO | Best 3 checkpoints:
170
+ 2026-01-25 00:07:51 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
171
+ 2026-01-25 00:07:51 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
172
+ 2026-01-25 00:07:51 | INFO | 3. Step 399: acc=0.7197 (reward_model.best_399.pt)
173
+ 2026-01-25 00:07:52 | INFO | Step 700: loss=0.2228 | IF_loss=0.2959, MQ_loss=0.1498 | acc=0.896 (IF=0.854, MQ=0.938) | lr=0.000009
174
+ 2026-01-25 00:09:41 | INFO |
175
+ ============================================================
176
+ Validation Results (took 6.89s):
177
+ Samples: 346 instruction, 346 quality
178
+ Instruction Acc: 0.6936
179
+ Quality Acc: 0.7341
180
+ Average Acc: 0.7139
181
+ Total Loss: 0.7643
182
+ Instruction Loss: 0.8224
183
+ Quality Loss: 0.7063
184
+ ============================================================
185
+ 2026-01-25 00:09:41 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_799.pt (filtered to 38.584M trainable parameters)
186
+ 2026-01-25 00:09:41 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_799.pt (428.0MB)
187
+ 2026-01-25 00:09:41 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_799.pt
188
+ 2026-01-25 00:09:41 | INFO | Best 3 checkpoints:
189
+ 2026-01-25 00:09:41 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
190
+ 2026-01-25 00:09:41 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
191
+ 2026-01-25 00:09:41 | INFO | 3. Step 399: acc=0.7197 (reward_model.best_399.pt)
192
+ 2026-01-25 00:09:42 | INFO | Step 800: loss=0.3570 | IF_loss=0.4116, MQ_loss=0.3024 | acc=0.792 (IF=0.771, MQ=0.812) | lr=0.000009
193
+ 2026-01-25 00:11:33 | INFO |
194
+ ============================================================
195
+ Validation Results (took 6.85s):
196
+ Samples: 346 instruction, 346 quality
197
+ Instruction Acc: 0.6850
198
+ Quality Acc: 0.7341
199
+ Average Acc: 0.7095
200
+ Total Loss: 0.7902
201
+ Instruction Loss: 0.8561
202
+ Quality Loss: 0.7244
203
+ ============================================================
204
+ 2026-01-25 00:11:33 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_899.pt (filtered to 38.584M trainable parameters)
205
+ 2026-01-25 00:11:33 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_899.pt (428.0MB)
206
+ 2026-01-25 00:11:33 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_899.pt
207
+ 2026-01-25 00:11:33 | INFO | Best 3 checkpoints:
208
+ 2026-01-25 00:11:33 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
209
+ 2026-01-25 00:11:33 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
210
+ 2026-01-25 00:11:33 | INFO | 3. Step 399: acc=0.7197 (reward_model.best_399.pt)
211
+ 2026-01-25 00:11:34 | INFO | Step 900: loss=0.2508 | IF_loss=0.2545, MQ_loss=0.2472 | acc=0.927 (IF=0.917, MQ=0.938) | lr=0.000009
212
+ 2026-01-25 00:13:21 | INFO |
213
+ ============================================================
214
+ Validation Results (took 7.29s):
215
+ Samples: 346 instruction, 346 quality
216
+ Instruction Acc: 0.6908
217
+ Quality Acc: 0.7254
218
+ Average Acc: 0.7081
219
+ Total Loss: 0.8355
220
+ Instruction Loss: 0.9110
221
+ Quality Loss: 0.7599
222
+ ============================================================
223
+ 2026-01-25 00:13:21 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_999.pt (filtered to 38.584M trainable parameters)
224
+ 2026-01-25 00:13:22 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_999.pt (428.0MB)
225
+ 2026-01-25 00:13:22 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_999.pt
226
+ 2026-01-25 00:13:22 | INFO | Best 3 checkpoints:
227
+ 2026-01-25 00:13:22 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
228
+ 2026-01-25 00:13:22 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
229
+ 2026-01-25 00:13:22 | INFO | 3. Step 399: acc=0.7197 (reward_model.best_399.pt)
230
+ 2026-01-25 00:13:23 | INFO | Step 1000: loss=0.2025 | IF_loss=0.1883, MQ_loss=0.2167 | acc=0.917 (IF=0.958, MQ=0.875) | lr=0.000009
231
+ 2026-01-25 00:15:12 | INFO |
232
+ ============================================================
233
+ Validation Results (took 7.33s):
234
+ Samples: 346 instruction, 346 quality
235
+ Instruction Acc: 0.6936
236
+ Quality Acc: 0.7312
237
+ Average Acc: 0.7124
238
+ Total Loss: 0.8744
239
+ Instruction Loss: 0.9563
240
+ Quality Loss: 0.7924
241
+ ============================================================
242
+ 2026-01-25 00:15:12 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1099.pt (filtered to 38.584M trainable parameters)
243
+ 2026-01-25 00:15:12 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1099.pt (428.0MB)
244
+ 2026-01-25 00:15:12 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1099.pt
245
+ 2026-01-25 00:15:12 | INFO | Best 3 checkpoints:
246
+ 2026-01-25 00:15:12 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
247
+ 2026-01-25 00:15:12 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
248
+ 2026-01-25 00:15:12 | INFO | 3. Step 399: acc=0.7197 (reward_model.best_399.pt)
249
+ 2026-01-25 00:15:13 | INFO | Step 1100: loss=0.2070 | IF_loss=0.2735, MQ_loss=0.1405 | acc=0.927 (IF=0.896, MQ=0.958) | lr=0.000008
250
+ 2026-01-25 00:17:01 | INFO |
251
+ ============================================================
252
+ Validation Results (took 7.23s):
253
+ Samples: 346 instruction, 346 quality
254
+ Instruction Acc: 0.6936
255
+ Quality Acc: 0.7341
256
+ Average Acc: 0.7139
257
+ Total Loss: 0.9238
258
+ Instruction Loss: 1.0105
259
+ Quality Loss: 0.8370
260
+ ============================================================
261
+ 2026-01-25 00:17:01 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1199.pt (filtered to 38.584M trainable parameters)
262
+ 2026-01-25 00:17:02 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1199.pt (428.0MB)
263
+ 2026-01-25 00:17:02 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1199.pt
264
+ 2026-01-25 00:17:02 | INFO | Best 3 checkpoints:
265
+ 2026-01-25 00:17:02 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
266
+ 2026-01-25 00:17:02 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
267
+ 2026-01-25 00:17:02 | INFO | 3. Step 399: acc=0.7197 (reward_model.best_399.pt)
268
+ 2026-01-25 00:17:03 | INFO | Step 1200: loss=0.1291 | IF_loss=0.1584, MQ_loss=0.0999 | acc=0.948 (IF=0.917, MQ=0.979) | lr=0.000008
269
+ 2026-01-25 00:18:54 | INFO |
270
+ ============================================================
271
+ Validation Results (took 7.69s):
272
+ Samples: 346 instruction, 346 quality
273
+ Instruction Acc: 0.6908
274
+ Quality Acc: 0.7225
275
+ Average Acc: 0.7066
276
+ Total Loss: 0.9501
277
+ Instruction Loss: 1.0487
278
+ Quality Loss: 0.8515
279
+ ============================================================
280
+ 2026-01-25 00:18:54 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1299.pt (filtered to 38.584M trainable parameters)
281
+ 2026-01-25 00:18:54 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1299.pt (428.0MB)
282
+ 2026-01-25 00:18:54 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1299.pt
283
+ 2026-01-25 00:18:54 | INFO | Best 3 checkpoints:
284
+ 2026-01-25 00:18:54 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
285
+ 2026-01-25 00:18:54 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
286
+ 2026-01-25 00:18:54 | INFO | 3. Step 399: acc=0.7197 (reward_model.best_399.pt)
287
+ 2026-01-25 00:18:55 | INFO | Step 1300: loss=0.2189 | IF_loss=0.2415, MQ_loss=0.1962 | acc=0.917 (IF=0.896, MQ=0.938) | lr=0.000008
288
+ 2026-01-25 00:20:44 | INFO |
289
+ ============================================================
290
+ Validation Results (took 7.70s):
291
+ Samples: 346 instruction, 346 quality
292
+ Instruction Acc: 0.6994
293
+ Quality Acc: 0.7312
294
+ Average Acc: 0.7153
295
+ Total Loss: 1.0001
296
+ Instruction Loss: 1.1038
297
+ Quality Loss: 0.8963
298
+ ============================================================
299
+ 2026-01-25 00:20:44 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1399.pt (filtered to 38.584M trainable parameters)
300
+ 2026-01-25 00:20:44 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1399.pt (428.0MB)
301
+ 2026-01-25 00:20:44 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_2354/ckpt/reward_model.best_1399.pt
302
+ 2026-01-25 00:20:44 | INFO | Best 3 checkpoints:
303
+ 2026-01-25 00:20:44 | INFO | 1. Step 299: acc=0.7283 (reward_model.best_299.pt)
304
+ 2026-01-25 00:20:44 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
305
+ 2026-01-25 00:20:44 | INFO | 3. Step 399: acc=0.7197 (reward_model.best_399.pt)
306
+ 2026-01-25 00:20:46 | INFO | Step 1400: loss=0.1710 | IF_loss=0.1713, MQ_loss=0.1707 | acc=0.938 (IF=0.917, MQ=0.958) | lr=0.000007
20260125_0035/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '0'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: true
10
+ apply_to_ref: true
11
+ enabled: true
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 200
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ max_samples: null
21
+ max_val_samples: null
22
+ metadata_jsonl: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/all_comparisons.jsonl
23
+ mode: raw_text_frozen_audio
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: null
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ no_condition: false
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_audio: true
107
+ use_layer_idx: -1
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: null
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 2000
125
+ warmup_steps: 10
126
+ max_grad_norm: 1
127
+ mlp_lr: 1.0e-05
128
+ num_train_steps: 2000
129
+ num_valid_batches: null
130
+ num_workers: 8
131
+ other_lr: 1.0e-05
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.8000.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: false
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 100
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: false
20260125_0035/reward_model/1769272544.7198617/events.out.tfevents.1769272544.MACLAB-S004.3403711.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e5b60b1838c4e344cf6890b2bdce509325a24d7cb04497d228623da81ae0116
3
+ size 503
20260125_0035/reward_model/1769272544.7213397/hparams.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ batch_size: 48
2
+ grad_accum_every: 1
3
+ learning_rate: 1.0e-05
4
+ num_train_steps: 2000
20260125_0035/reward_model/events.out.tfevents.1769272544.MACLAB-S004.3403711.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d068740b9c57c21acd78084f3e19ca1c6abadd2922a09126bbb46b8e1f5f7901
3
+ size 873949
20260125_0035/train.20260125_0035.log ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-25 00:35:33 | INFO | Log file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/train.20260125_0035.log
2
+ 2026-01-25 00:35:33 | INFO | Random seed set to 42
3
+ 2026-01-25 00:35:35 | INFO | Created RawTextFrozenAudioDataset with 3463 samples
4
+ 2026-01-25 00:35:35 | INFO | Split dataset into train (3117) and validation (346) sets (ratio: 10.00%)
5
+ 2026-01-25 00:35:35 | INFO | Will resume from checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.8000.pt
6
+ 2026-01-25 00:35:35 | INFO | Using checkpoint config for model initialization (continue training mode)
7
+ 2026-01-25 00:35:42 | INFO | Created RewardAttentionModel with attention_mode=SA
8
+ 2026-01-25 00:35:42 | INFO | Created PreferenceLoss with filter_ties=True
9
+ 2026-01-25 00:35:42 | INFO | ✓ Gradient checkpointing enabled
10
+ 2026-01-25 00:35:42 | INFO | ✓ Audio cropping enabled: min=200, max=1500
11
+ 2026-01-25 00:35:42 | INFO | Apply to eval: True, ref: True
12
+ 2026-01-25 00:35:42 | INFO | Modes: train=random, val=start
13
+ 2026-01-25 00:35:42 | INFO | MLP head parameters: 1,186,563 params, lr=1e-05
14
+ 2026-01-25 00:35:42 | INFO | Other parameters: 37,397,634 params, lr=1e-05
15
+ 2026-01-25 00:35:42 | INFO | Using lr_schedule=linear_cosine warmup_steps=10 total_steps=2000
16
+ 2026-01-25 00:35:42 | INFO | Training with fixed validation set
17
+ 2026-01-25 00:35:42 | INFO | Train batch_size: 48, Valid batch_size: 20
18
+ 2026-01-25 00:35:44 | INFO | Missing keys (782): ['text_module.model.embeddings.word_embeddings.weight', 'text_module.model.embeddings.position_embeddings.weight', 'text_module.model.embeddings.token_type_embeddings.weight', 'text_module.model.embeddings.LayerNorm.weight', 'text_module.model.embeddings.LayerNorm.bias']...
19
+ 2026-01-25 00:35:44 | INFO | ✓ Starting from step 0 (transfer learning mode, ignoring checkpoint steps=8000)
20
+ 2026-01-25 00:35:44 | INFO | Resumed from /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.8000.pt
21
+ 2026-01-25 00:35:44 | INFO | Parameters: 701.162M total, 38.584M trainable
22
+ 2026-01-25 00:35:44 | INFO | Text encoder (frozen): 328.389M
23
+ 2026-01-25 00:35:44 | INFO | Audio encoder (frozen): 334.189M
24
+ 2026-01-25 00:35:44 | INFO | Other trainable: 38.584M
25
+ 2026-01-25 00:35:44 | INFO | ℹ No LoRA configuration detected
26
+ 2026-01-25 00:35:44 | INFO | ============================================================
27
+ 2026-01-25 00:35:44 | INFO | Ready to start training
28
+ 2026-01-25 00:35:44 | INFO | ============================================================
29
+ 2026-01-25 00:35:44 | INFO | Starting training from step 0
30
+ 2026-01-25 00:35:44 | INFO | ===== Accelerator / CUDA Debug Info =====
31
+ 2026-01-25 00:35:44 | INFO | accelerator.device = cuda
32
+ 2026-01-25 00:35:44 | INFO | mixed_precision = bf16
33
+ 2026-01-25 00:35:44 | INFO | distributed_type = NO
34
+ 2026-01-25 00:35:44 | INFO | num_processes = 1
35
+ 2026-01-25 00:35:44 | INFO | process_index = 0
36
+ 2026-01-25 00:35:44 | INFO | is_main_process = True
37
+ 2026-01-25 00:35:44 | INFO | torch.cuda.is_available() = True
38
+ 2026-01-25 00:35:44 | INFO | torch.cuda.device_count() = 1
39
+ 2026-01-25 00:35:44 | INFO | current_device = 0
40
+ 2026-01-25 00:35:44 | INFO | device_name = NVIDIA GeForce RTX 4090
41
+ 2026-01-25 00:35:44 | INFO | model parameter device = cuda:0
42
+ 2026-01-25 00:35:44 | INFO | Training for 2000.0 steps (~32 epochs, 64 steps/epoch)
43
+ 2026-01-25 00:35:52 | INFO | Step 0: loss=0.7688 | IF_loss=0.9857, MQ_loss=0.5519 | acc=0.729 (IF=0.688, MQ=0.771) | lr=0.000002
44
+ 2026-01-25 00:35:52 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.0.pt (filtered to 38.584M trainable parameters)
45
+ 2026-01-25 00:35:53 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.0.pt (428.0MB)
46
+ 2026-01-25 00:35:53 | INFO | Step 0: Saved to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.0.pt
47
+ 2026-01-25 00:37:35 | INFO |
48
+ ============================================================
49
+ Validation Results (took 8.15s):
50
+ Samples: 346 instruction, 346 quality
51
+ Instruction Acc: 0.7052
52
+ Quality Acc: 0.7052
53
+ Average Acc: 0.7052
54
+ Total Loss: 0.6842
55
+ Instruction Loss: 0.6988
56
+ Quality Loss: 0.6695
57
+ ============================================================
58
+ 2026-01-25 00:37:36 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_99.pt (filtered to 38.584M trainable parameters)
59
+ 2026-01-25 00:37:36 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_99.pt (428.0MB)
60
+ 2026-01-25 00:37:36 | INFO | Best 1 checkpoints:
61
+ 2026-01-25 00:37:36 | INFO | 1. Step 99: acc=0.7052 (reward_model.best_99.pt)
62
+ 2026-01-25 00:37:37 | INFO | Step 100: loss=0.5884 | IF_loss=0.5924, MQ_loss=0.5843 | acc=0.688 (IF=0.646, MQ=0.729) | lr=0.000010
63
+ 2026-01-25 00:39:28 | INFO |
64
+ ============================================================
65
+ Validation Results (took 9.30s):
66
+ Samples: 346 instruction, 346 quality
67
+ Instruction Acc: 0.7197
68
+ Quality Acc: 0.7457
69
+ Average Acc: 0.7327
70
+ Total Loss: 0.6014
71
+ Instruction Loss: 0.6253
72
+ Quality Loss: 0.5774
73
+ ============================================================
74
+ 2026-01-25 00:39:28 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_199.pt (filtered to 38.584M trainable parameters)
75
+ 2026-01-25 00:39:28 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_199.pt (428.0MB)
76
+ 2026-01-25 00:39:28 | INFO | Best 2 checkpoints:
77
+ 2026-01-25 00:39:28 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
78
+ 2026-01-25 00:39:28 | INFO | 2. Step 99: acc=0.7052 (reward_model.best_99.pt)
79
+ 2026-01-25 00:39:29 | INFO | Step 200: loss=0.3779 | IF_loss=0.3349, MQ_loss=0.4209 | acc=0.844 (IF=0.896, MQ=0.792) | lr=0.000010
80
+ 2026-01-25 00:41:19 | INFO |
81
+ ============================================================
82
+ Validation Results (took 7.41s):
83
+ Samples: 346 instruction, 346 quality
84
+ Instruction Acc: 0.7110
85
+ Quality Acc: 0.7514
86
+ Average Acc: 0.7312
87
+ Total Loss: 0.5899
88
+ Instruction Loss: 0.6186
89
+ Quality Loss: 0.5612
90
+ ============================================================
91
+ 2026-01-25 00:41:19 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_299.pt (filtered to 38.584M trainable parameters)
92
+ 2026-01-25 00:41:19 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_299.pt (428.0MB)
93
+ 2026-01-25 00:41:19 | INFO | Best 3 checkpoints:
94
+ 2026-01-25 00:41:19 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
95
+ 2026-01-25 00:41:19 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
96
+ 2026-01-25 00:41:19 | INFO | 3. Step 99: acc=0.7052 (reward_model.best_99.pt)
97
+ 2026-01-25 00:41:20 | INFO | Step 300: loss=0.3940 | IF_loss=0.4496, MQ_loss=0.3384 | acc=0.802 (IF=0.792, MQ=0.812) | lr=0.000009
98
+ 2026-01-25 00:43:18 | INFO |
99
+ ============================================================
100
+ Validation Results (took 8.84s):
101
+ Samples: 346 instruction, 346 quality
102
+ Instruction Acc: 0.7168
103
+ Quality Acc: 0.7399
104
+ Average Acc: 0.7283
105
+ Total Loss: 0.5863
106
+ Instruction Loss: 0.6144
107
+ Quality Loss: 0.5582
108
+ ============================================================
109
+ 2026-01-25 00:43:18 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_399.pt (filtered to 38.584M trainable parameters)
110
+ 2026-01-25 00:43:18 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_399.pt (428.0MB)
111
+ 2026-01-25 00:43:18 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_99.pt
112
+ 2026-01-25 00:43:18 | INFO | Best 3 checkpoints:
113
+ 2026-01-25 00:43:18 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
114
+ 2026-01-25 00:43:18 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
115
+ 2026-01-25 00:43:18 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
116
+ 2026-01-25 00:43:19 | INFO | Step 400: loss=0.5238 | IF_loss=0.5729, MQ_loss=0.4747 | acc=0.771 (IF=0.708, MQ=0.833) | lr=0.000009
117
+ 2026-01-25 00:45:10 | INFO |
118
+ ============================================================
119
+ Validation Results (took 7.63s):
120
+ Samples: 346 instruction, 346 quality
121
+ Instruction Acc: 0.7052
122
+ Quality Acc: 0.7428
123
+ Average Acc: 0.7240
124
+ Total Loss: 0.5915
125
+ Instruction Loss: 0.6221
126
+ Quality Loss: 0.5608
127
+ ============================================================
128
+ 2026-01-25 00:45:10 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_499.pt (filtered to 38.584M trainable parameters)
129
+ 2026-01-25 00:45:10 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_499.pt (428.0MB)
130
+ 2026-01-25 00:45:10 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_499.pt
131
+ 2026-01-25 00:45:10 | INFO | Best 3 checkpoints:
132
+ 2026-01-25 00:45:10 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
133
+ 2026-01-25 00:45:10 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
134
+ 2026-01-25 00:45:10 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
135
+ 2026-01-25 00:45:11 | INFO | Step 500: loss=0.4478 | IF_loss=0.4706, MQ_loss=0.4250 | acc=0.760 (IF=0.792, MQ=0.729) | lr=0.000009
136
+ 2026-01-25 00:47:06 | INFO |
137
+ ============================================================
138
+ Validation Results (took 8.13s):
139
+ Samples: 346 instruction, 346 quality
140
+ Instruction Acc: 0.6994
141
+ Quality Acc: 0.7486
142
+ Average Acc: 0.7240
143
+ Total Loss: 0.5893
144
+ Instruction Loss: 0.6203
145
+ Quality Loss: 0.5584
146
+ ============================================================
147
+ 2026-01-25 00:47:06 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_599.pt (filtered to 38.584M trainable parameters)
148
+ 2026-01-25 00:47:06 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_599.pt (428.0MB)
149
+ 2026-01-25 00:47:06 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_599.pt
150
+ 2026-01-25 00:47:06 | INFO | Best 3 checkpoints:
151
+ 2026-01-25 00:47:06 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
152
+ 2026-01-25 00:47:06 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
153
+ 2026-01-25 00:47:06 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
154
+ 2026-01-25 00:47:08 | INFO | Step 600: loss=0.4813 | IF_loss=0.4879, MQ_loss=0.4747 | acc=0.802 (IF=0.792, MQ=0.812) | lr=0.000008
155
+ 2026-01-25 00:49:00 | INFO |
156
+ ============================================================
157
+ Validation Results (took 7.74s):
158
+ Samples: 346 instruction, 346 quality
159
+ Instruction Acc: 0.7023
160
+ Quality Acc: 0.7486
161
+ Average Acc: 0.7254
162
+ Total Loss: 0.5964
163
+ Instruction Loss: 0.6307
164
+ Quality Loss: 0.5621
165
+ ============================================================
166
+ 2026-01-25 00:49:00 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_699.pt (filtered to 38.584M trainable parameters)
167
+ 2026-01-25 00:49:00 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_699.pt (428.0MB)
168
+ 2026-01-25 00:49:00 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_699.pt
169
+ 2026-01-25 00:49:00 | INFO | Best 3 checkpoints:
170
+ 2026-01-25 00:49:00 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
171
+ 2026-01-25 00:49:00 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
172
+ 2026-01-25 00:49:00 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
173
+ 2026-01-25 00:49:01 | INFO | Step 700: loss=0.2444 | IF_loss=0.3514, MQ_loss=0.1374 | acc=0.927 (IF=0.896, MQ=0.958) | lr=0.000007
174
+ 2026-01-25 00:51:03 | INFO |
175
+ ============================================================
176
+ Validation Results (took 8.47s):
177
+ Samples: 346 instruction, 346 quality
178
+ Instruction Acc: 0.7023
179
+ Quality Acc: 0.7457
180
+ Average Acc: 0.7240
181
+ Total Loss: 0.6049
182
+ Instruction Loss: 0.6406
183
+ Quality Loss: 0.5693
184
+ ============================================================
185
+ 2026-01-25 00:51:03 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_799.pt (filtered to 38.584M trainable parameters)
186
+ 2026-01-25 00:51:03 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_799.pt (428.0MB)
187
+ 2026-01-25 00:51:03 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_799.pt
188
+ 2026-01-25 00:51:03 | INFO | Best 3 checkpoints:
189
+ 2026-01-25 00:51:03 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
190
+ 2026-01-25 00:51:03 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
191
+ 2026-01-25 00:51:03 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
192
+ 2026-01-25 00:51:04 | INFO | Step 800: loss=0.4378 | IF_loss=0.5861, MQ_loss=0.2894 | acc=0.729 (IF=0.625, MQ=0.833) | lr=0.000007
193
+ 2026-01-25 00:53:00 | INFO |
194
+ ============================================================
195
+ Validation Results (took 7.86s):
196
+ Samples: 346 instruction, 346 quality
197
+ Instruction Acc: 0.7023
198
+ Quality Acc: 0.7543
199
+ Average Acc: 0.7283
200
+ Total Loss: 0.6092
201
+ Instruction Loss: 0.6455
202
+ Quality Loss: 0.5729
203
+ ============================================================
204
+ 2026-01-25 00:53:00 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_899.pt (filtered to 38.584M trainable parameters)
205
+ 2026-01-25 00:53:00 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_899.pt (428.0MB)
206
+ 2026-01-25 00:53:00 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_899.pt
207
+ 2026-01-25 00:53:00 | INFO | Best 3 checkpoints:
208
+ 2026-01-25 00:53:00 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
209
+ 2026-01-25 00:53:00 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
210
+ 2026-01-25 00:53:00 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
211
+ 2026-01-25 00:53:01 | INFO | Step 900: loss=0.4075 | IF_loss=0.4561, MQ_loss=0.3589 | acc=0.771 (IF=0.750, MQ=0.792) | lr=0.000006
212
+ 2026-01-25 00:54:54 | INFO |
213
+ ============================================================
214
+ Validation Results (took 8.39s):
215
+ Samples: 346 instruction, 346 quality
216
+ Instruction Acc: 0.6936
217
+ Quality Acc: 0.7543
218
+ Average Acc: 0.7240
219
+ Total Loss: 0.6166
220
+ Instruction Loss: 0.6537
221
+ Quality Loss: 0.5795
222
+ ============================================================
223
+ 2026-01-25 00:54:54 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_999.pt (filtered to 38.584M trainable parameters)
224
+ 2026-01-25 00:54:55 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_999.pt (428.0MB)
225
+ 2026-01-25 00:54:55 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_999.pt
226
+ 2026-01-25 00:54:55 | INFO | Best 3 checkpoints:
227
+ 2026-01-25 00:54:55 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
228
+ 2026-01-25 00:54:55 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
229
+ 2026-01-25 00:54:55 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
230
+ 2026-01-25 00:54:56 | INFO | Step 1000: loss=0.3655 | IF_loss=0.3244, MQ_loss=0.4067 | acc=0.823 (IF=0.833, MQ=0.812) | lr=0.000005
231
+ 2026-01-25 00:56:54 | INFO |
232
+ ============================================================
233
+ Validation Results (took 7.77s):
234
+ Samples: 346 instruction, 346 quality
235
+ Instruction Acc: 0.6994
236
+ Quality Acc: 0.7514
237
+ Average Acc: 0.7254
238
+ Total Loss: 0.6242
239
+ Instruction Loss: 0.6619
240
+ Quality Loss: 0.5864
241
+ ============================================================
242
+ 2026-01-25 00:56:54 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1099.pt (filtered to 38.584M trainable parameters)
243
+ 2026-01-25 00:56:54 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1099.pt (428.0MB)
244
+ 2026-01-25 00:56:54 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1099.pt
245
+ 2026-01-25 00:56:54 | INFO | Best 3 checkpoints:
246
+ 2026-01-25 00:56:54 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
247
+ 2026-01-25 00:56:54 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
248
+ 2026-01-25 00:56:54 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
249
+ 2026-01-25 00:56:55 | INFO | Step 1100: loss=0.3254 | IF_loss=0.3815, MQ_loss=0.2692 | acc=0.865 (IF=0.854, MQ=0.875) | lr=0.000004
250
+ 2026-01-25 00:58:51 | INFO |
251
+ ============================================================
252
+ Validation Results (took 8.71s):
253
+ Samples: 346 instruction, 346 quality
254
+ Instruction Acc: 0.6965
255
+ Quality Acc: 0.7514
256
+ Average Acc: 0.7240
257
+ Total Loss: 0.6286
258
+ Instruction Loss: 0.6684
259
+ Quality Loss: 0.5887
260
+ ============================================================
261
+ 2026-01-25 00:58:51 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1199.pt (filtered to 38.584M trainable parameters)
262
+ 2026-01-25 00:58:52 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1199.pt (428.0MB)
263
+ 2026-01-25 00:58:52 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1199.pt
264
+ 2026-01-25 00:58:52 | INFO | Best 3 checkpoints:
265
+ 2026-01-25 00:58:52 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
266
+ 2026-01-25 00:58:52 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
267
+ 2026-01-25 00:58:52 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
268
+ 2026-01-25 00:58:53 | INFO | Step 1200: loss=0.2899 | IF_loss=0.3551, MQ_loss=0.2248 | acc=0.823 (IF=0.750, MQ=0.896) | lr=0.000003
269
+ 2026-01-25 01:00:48 | INFO |
270
+ ============================================================
271
+ Validation Results (took 7.62s):
272
+ Samples: 346 instruction, 346 quality
273
+ Instruction Acc: 0.6965
274
+ Quality Acc: 0.7543
275
+ Average Acc: 0.7254
276
+ Total Loss: 0.6329
277
+ Instruction Loss: 0.6736
278
+ Quality Loss: 0.5922
279
+ ============================================================
280
+ 2026-01-25 01:00:48 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1299.pt (filtered to 38.584M trainable parameters)
281
+ 2026-01-25 01:00:48 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1299.pt (428.0MB)
282
+ 2026-01-25 01:00:48 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1299.pt
283
+ 2026-01-25 01:00:48 | INFO | Best 3 checkpoints:
284
+ 2026-01-25 01:00:48 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
285
+ 2026-01-25 01:00:48 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
286
+ 2026-01-25 01:00:48 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
287
+ 2026-01-25 01:00:49 | INFO | Step 1300: loss=0.3270 | IF_loss=0.3120, MQ_loss=0.3420 | acc=0.875 (IF=0.917, MQ=0.833) | lr=0.000003
288
+ 2026-01-25 01:02:37 | INFO |
289
+ ============================================================
290
+ Validation Results (took 6.98s):
291
+ Samples: 346 instruction, 346 quality
292
+ Instruction Acc: 0.6994
293
+ Quality Acc: 0.7514
294
+ Average Acc: 0.7254
295
+ Total Loss: 0.6344
296
+ Instruction Loss: 0.6752
297
+ Quality Loss: 0.5936
298
+ ============================================================
299
+ 2026-01-25 01:02:37 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1399.pt (filtered to 38.584M trainable parameters)
300
+ 2026-01-25 01:02:38 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1399.pt (428.0MB)
301
+ 2026-01-25 01:02:38 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1399.pt
302
+ 2026-01-25 01:02:38 | INFO | Best 3 checkpoints:
303
+ 2026-01-25 01:02:38 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
304
+ 2026-01-25 01:02:38 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
305
+ 2026-01-25 01:02:38 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
306
+ 2026-01-25 01:02:39 | INFO | Step 1400: loss=0.3501 | IF_loss=0.4404, MQ_loss=0.2599 | acc=0.854 (IF=0.812, MQ=0.896) | lr=0.000002
307
+ 2026-01-25 01:04:28 | INFO |
308
+ ============================================================
309
+ Validation Results (took 7.15s):
310
+ Samples: 346 instruction, 346 quality
311
+ Instruction Acc: 0.6965
312
+ Quality Acc: 0.7514
313
+ Average Acc: 0.7240
314
+ Total Loss: 0.6387
315
+ Instruction Loss: 0.6798
316
+ Quality Loss: 0.5976
317
+ ============================================================
318
+ 2026-01-25 01:04:28 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1499.pt (filtered to 38.584M trainable parameters)
319
+ 2026-01-25 01:04:29 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1499.pt (428.0MB)
320
+ 2026-01-25 01:04:29 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1499.pt
321
+ 2026-01-25 01:04:29 | INFO | Best 3 checkpoints:
322
+ 2026-01-25 01:04:29 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
323
+ 2026-01-25 01:04:29 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
324
+ 2026-01-25 01:04:29 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
325
+ 2026-01-25 01:04:30 | INFO | Step 1500: loss=0.2991 | IF_loss=0.3190, MQ_loss=0.2793 | acc=0.833 (IF=0.833, MQ=0.833) | lr=0.000001
326
+ 2026-01-25 01:06:20 | INFO |
327
+ ============================================================
328
+ Validation Results (took 7.69s):
329
+ Samples: 346 instruction, 346 quality
330
+ Instruction Acc: 0.6994
331
+ Quality Acc: 0.7514
332
+ Average Acc: 0.7254
333
+ Total Loss: 0.6398
334
+ Instruction Loss: 0.6813
335
+ Quality Loss: 0.5983
336
+ ============================================================
337
+ 2026-01-25 01:06:21 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1599.pt (filtered to 38.584M trainable parameters)
338
+ 2026-01-25 01:06:21 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1599.pt (428.0MB)
339
+ 2026-01-25 01:06:21 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1599.pt
340
+ 2026-01-25 01:06:21 | INFO | Best 3 checkpoints:
341
+ 2026-01-25 01:06:21 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
342
+ 2026-01-25 01:06:21 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
343
+ 2026-01-25 01:06:21 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
344
+ 2026-01-25 01:06:25 | INFO | Step 1600: loss=0.2735 | IF_loss=0.3038, MQ_loss=0.2432 | acc=0.906 (IF=0.875, MQ=0.938) | lr=0.000001
345
+ 2026-01-25 01:08:16 | INFO |
346
+ ============================================================
347
+ Validation Results (took 7.81s):
348
+ Samples: 346 instruction, 346 quality
349
+ Instruction Acc: 0.6994
350
+ Quality Acc: 0.7514
351
+ Average Acc: 0.7254
352
+ Total Loss: 0.6407
353
+ Instruction Loss: 0.6825
354
+ Quality Loss: 0.5989
355
+ ============================================================
356
+ 2026-01-25 01:08:16 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1699.pt (filtered to 38.584M trainable parameters)
357
+ 2026-01-25 01:08:16 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1699.pt (428.0MB)
358
+ 2026-01-25 01:08:16 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1699.pt
359
+ 2026-01-25 01:08:16 | INFO | Best 3 checkpoints:
360
+ 2026-01-25 01:08:16 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
361
+ 2026-01-25 01:08:16 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
362
+ 2026-01-25 01:08:16 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
363
+ 2026-01-25 01:08:17 | INFO | Step 1700: loss=0.3877 | IF_loss=0.2611, MQ_loss=0.5142 | acc=0.771 (IF=0.896, MQ=0.646) | lr=0.000001
364
+ 2026-01-25 01:10:15 | INFO |
365
+ ============================================================
366
+ Validation Results (took 7.66s):
367
+ Samples: 346 instruction, 346 quality
368
+ Instruction Acc: 0.6994
369
+ Quality Acc: 0.7514
370
+ Average Acc: 0.7254
371
+ Total Loss: 0.6421
372
+ Instruction Loss: 0.6844
373
+ Quality Loss: 0.5999
374
+ ============================================================
375
+ 2026-01-25 01:10:15 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1799.pt (filtered to 38.584M trainable parameters)
376
+ 2026-01-25 01:10:15 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1799.pt (428.0MB)
377
+ 2026-01-25 01:10:15 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1799.pt
378
+ 2026-01-25 01:10:15 | INFO | Best 3 checkpoints:
379
+ 2026-01-25 01:10:15 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
380
+ 2026-01-25 01:10:15 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
381
+ 2026-01-25 01:10:15 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
382
+ 2026-01-25 01:10:16 | INFO | Step 1800: loss=0.2445 | IF_loss=0.2773, MQ_loss=0.2117 | acc=0.896 (IF=0.854, MQ=0.938) | lr=0.000000
383
+ 2026-01-25 01:12:06 | INFO |
384
+ ============================================================
385
+ Validation Results (took 7.71s):
386
+ Samples: 346 instruction, 346 quality
387
+ Instruction Acc: 0.6994
388
+ Quality Acc: 0.7514
389
+ Average Acc: 0.7254
390
+ Total Loss: 0.6428
391
+ Instruction Loss: 0.6848
392
+ Quality Loss: 0.6007
393
+ ============================================================
394
+ 2026-01-25 01:12:06 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1899.pt (filtered to 38.584M trainable parameters)
395
+ 2026-01-25 01:12:06 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1899.pt (428.0MB)
396
+ 2026-01-25 01:12:06 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1899.pt
397
+ 2026-01-25 01:12:06 | INFO | Best 3 checkpoints:
398
+ 2026-01-25 01:12:06 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
399
+ 2026-01-25 01:12:06 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
400
+ 2026-01-25 01:12:06 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
401
+ 2026-01-25 01:12:07 | INFO | Step 1900: loss=0.2576 | IF_loss=0.2896, MQ_loss=0.2257 | acc=0.833 (IF=0.771, MQ=0.896) | lr=0.000000
402
+ 2026-01-25 01:14:00 | INFO |
403
+ ============================================================
404
+ Validation Results (took 7.55s):
405
+ Samples: 346 instruction, 346 quality
406
+ Instruction Acc: 0.6994
407
+ Quality Acc: 0.7514
408
+ Average Acc: 0.7254
409
+ Total Loss: 0.6428
410
+ Instruction Loss: 0.6847
411
+ Quality Loss: 0.6010
412
+ ============================================================
413
+ 2026-01-25 01:14:00 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1999.pt (filtered to 38.584M trainable parameters)
414
+ 2026-01-25 01:14:00 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1999.pt (428.0MB)
415
+ 2026-01-25 01:14:00 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0035/ckpt/reward_model.best_1999.pt
416
+ 2026-01-25 01:14:00 | INFO | Best 3 checkpoints:
417
+ 2026-01-25 01:14:00 | INFO | 1. Step 199: acc=0.7327 (reward_model.best_199.pt)
418
+ 2026-01-25 01:14:00 | INFO | 2. Step 299: acc=0.7312 (reward_model.best_299.pt)
419
+ 2026-01-25 01:14:00 | INFO | 3. Step 399: acc=0.7283 (reward_model.best_399.pt)
420
+ 2026-01-25 01:14:00 | INFO | Training complete!
421
+ 2026-01-25 01:14:00 | INFO | Training complete!
20260125_0037/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '1'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: true
10
+ apply_to_ref: true
11
+ enabled: true
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 200
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ max_samples: null
21
+ max_val_samples: null
22
+ metadata_jsonl: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/all_comparisons.jsonl
23
+ mode: raw_text_frozen_audio
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: null
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ no_condition: false
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_audio: true
107
+ use_layer_idx: -1
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: null
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 2000
125
+ warmup_steps: 10
126
+ max_grad_norm: 1
127
+ mlp_lr: 1.0e-05
128
+ num_train_steps: 2000
129
+ num_valid_batches: null
130
+ num_workers: 8
131
+ other_lr: 1.0e-05
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.0.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: false
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 100
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: false
20260125_0037/eval_results_0125_1713.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
20260125_0037/reward_model/1769272678.832529/events.out.tfevents.1769272678.MACLAB-S004.3414271.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b803b753ca1da01eb8873ce114173c51f03c97c09a2bf8250935c19916c7993
3
+ size 503
20260125_0037/reward_model/1769272678.8337765/hparams.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ batch_size: 48
2
+ grad_accum_every: 1
3
+ learning_rate: 1.0e-05
4
+ num_train_steps: 2000
20260125_0037/reward_model/events.out.tfevents.1769272678.MACLAB-S004.3414271.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da51cc565d6934666cc50ed6ca2621a2ab35e5035a94a17869931f3b323adb3e
3
+ size 873949
20260125_0037/train.20260125_0037.log ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-25 00:37:47 | INFO | Log file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/train.20260125_0037.log
2
+ 2026-01-25 00:37:47 | INFO | Random seed set to 42
3
+ 2026-01-25 00:37:49 | INFO | Created RawTextFrozenAudioDataset with 3463 samples
4
+ 2026-01-25 00:37:49 | INFO | Split dataset into train (3117) and validation (346) sets (ratio: 10.00%)
5
+ 2026-01-25 00:37:49 | INFO | Will resume from checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.0.pt
6
+ 2026-01-25 00:37:49 | INFO | Using checkpoint config for model initialization (continue training mode)
7
+ 2026-01-25 00:37:55 | INFO | Created RewardAttentionModel with attention_mode=SA
8
+ 2026-01-25 00:37:55 | INFO | Created PreferenceLoss with filter_ties=True
9
+ 2026-01-25 00:37:56 | INFO | ✓ Gradient checkpointing enabled
10
+ 2026-01-25 00:37:56 | INFO | ✓ Audio cropping enabled: min=200, max=1500
11
+ 2026-01-25 00:37:56 | INFO | Apply to eval: True, ref: True
12
+ 2026-01-25 00:37:56 | INFO | Modes: train=random, val=start
13
+ 2026-01-25 00:37:56 | INFO | MLP head parameters: 1,186,563 params, lr=1e-05
14
+ 2026-01-25 00:37:56 | INFO | Other parameters: 37,397,634 params, lr=1e-05
15
+ 2026-01-25 00:37:56 | INFO | Using lr_schedule=linear_cosine warmup_steps=10 total_steps=2000
16
+ 2026-01-25 00:37:56 | INFO | Training with fixed validation set
17
+ 2026-01-25 00:37:56 | INFO | Train batch_size: 48, Valid batch_size: 20
18
+ 2026-01-25 00:37:58 | INFO | Missing keys (782): ['text_module.model.embeddings.word_embeddings.weight', 'text_module.model.embeddings.position_embeddings.weight', 'text_module.model.embeddings.token_type_embeddings.weight', 'text_module.model.embeddings.LayerNorm.weight', 'text_module.model.embeddings.LayerNorm.bias']...
19
+ 2026-01-25 00:37:58 | INFO | ✓ Starting from step 0 (transfer learning mode, ignoring checkpoint steps=0)
20
+ 2026-01-25 00:37:58 | INFO | Resumed from /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.0.pt
21
+ 2026-01-25 00:37:58 | INFO | Parameters: 701.162M total, 38.584M trainable
22
+ 2026-01-25 00:37:58 | INFO | Text encoder (frozen): 328.389M
23
+ 2026-01-25 00:37:58 | INFO | Audio encoder (frozen): 334.189M
24
+ 2026-01-25 00:37:58 | INFO | Other trainable: 38.584M
25
+ 2026-01-25 00:37:58 | INFO | ℹ No LoRA configuration detected
26
+ 2026-01-25 00:37:58 | INFO | ============================================================
27
+ 2026-01-25 00:37:58 | INFO | Ready to start training
28
+ 2026-01-25 00:37:58 | INFO | ============================================================
29
+ 2026-01-25 00:37:58 | INFO | Starting training from step 0
30
+ 2026-01-25 00:37:58 | INFO | ===== Accelerator / CUDA Debug Info =====
31
+ 2026-01-25 00:37:58 | INFO | accelerator.device = cuda
32
+ 2026-01-25 00:37:58 | INFO | mixed_precision = bf16
33
+ 2026-01-25 00:37:58 | INFO | distributed_type = NO
34
+ 2026-01-25 00:37:58 | INFO | num_processes = 1
35
+ 2026-01-25 00:37:58 | INFO | process_index = 0
36
+ 2026-01-25 00:37:58 | INFO | is_main_process = True
37
+ 2026-01-25 00:37:58 | INFO | torch.cuda.is_available() = True
38
+ 2026-01-25 00:37:58 | INFO | torch.cuda.device_count() = 1
39
+ 2026-01-25 00:37:58 | INFO | current_device = 0
40
+ 2026-01-25 00:37:58 | INFO | device_name = NVIDIA GeForce RTX 4090
41
+ 2026-01-25 00:37:58 | INFO | model parameter device = cuda:0
42
+ 2026-01-25 00:37:58 | INFO | Training for 2000.0 steps (~32 epochs, 64 steps/epoch)
43
+ 2026-01-25 00:38:08 | INFO | Step 0: loss=0.6973 | IF_loss=0.6935, MQ_loss=0.7010 | acc=0.510 (IF=0.521, MQ=0.500) | lr=0.000002
44
+ 2026-01-25 00:38:09 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.0.pt (filtered to 38.584M trainable parameters)
45
+ 2026-01-25 00:38:09 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.0.pt (428.0MB)
46
+ 2026-01-25 00:38:09 | INFO | Step 0: Saved to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.0.pt
47
+ 2026-01-25 00:40:04 | INFO |
48
+ ============================================================
49
+ Validation Results (took 11.05s):
50
+ Samples: 346 instruction, 346 quality
51
+ Instruction Acc: 0.6416
52
+ Quality Acc: 0.7312
53
+ Average Acc: 0.6864
54
+ Total Loss: 0.5721
55
+ Instruction Loss: 0.6193
56
+ Quality Loss: 0.5249
57
+ ============================================================
58
+ 2026-01-25 00:40:04 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_99.pt (filtered to 38.584M trainable parameters)
59
+ 2026-01-25 00:40:04 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_99.pt (428.0MB)
60
+ 2026-01-25 00:40:04 | INFO | Best 1 checkpoints:
61
+ 2026-01-25 00:40:04 | INFO | 1. Step 99: acc=0.6864 (reward_model.best_99.pt)
62
+ 2026-01-25 00:40:05 | INFO | Step 100: loss=0.5208 | IF_loss=0.5649, MQ_loss=0.4766 | acc=0.740 (IF=0.708, MQ=0.771) | lr=0.000010
63
+ 2026-01-25 00:42:11 | INFO |
64
+ ============================================================
65
+ Validation Results (took 11.25s):
66
+ Samples: 346 instruction, 346 quality
67
+ Instruction Acc: 0.6705
68
+ Quality Acc: 0.7225
69
+ Average Acc: 0.6965
70
+ Total Loss: 0.5544
71
+ Instruction Loss: 0.5969
72
+ Quality Loss: 0.5120
73
+ ============================================================
74
+ 2026-01-25 00:42:11 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_199.pt (filtered to 38.584M trainable parameters)
75
+ 2026-01-25 00:42:12 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_199.pt (428.0MB)
76
+ 2026-01-25 00:42:12 | INFO | Best 2 checkpoints:
77
+ 2026-01-25 00:42:12 | INFO | 1. Step 199: acc=0.6965 (reward_model.best_199.pt)
78
+ 2026-01-25 00:42:12 | INFO | 2. Step 99: acc=0.6864 (reward_model.best_99.pt)
79
+ 2026-01-25 00:42:13 | INFO | Step 200: loss=0.3984 | IF_loss=0.4045, MQ_loss=0.3923 | acc=0.823 (IF=0.812, MQ=0.833) | lr=0.000010
80
+ 2026-01-25 00:44:08 | INFO |
81
+ ============================================================
82
+ Validation Results (took 8.90s):
83
+ Samples: 346 instruction, 346 quality
84
+ Instruction Acc: 0.6792
85
+ Quality Acc: 0.7370
86
+ Average Acc: 0.7081
87
+ Total Loss: 0.5605
88
+ Instruction Loss: 0.6104
89
+ Quality Loss: 0.5105
90
+ ============================================================
91
+ 2026-01-25 00:44:09 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_299.pt (filtered to 38.584M trainable parameters)
92
+ 2026-01-25 00:44:09 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_299.pt (428.0MB)
93
+ 2026-01-25 00:44:09 | INFO | Best 3 checkpoints:
94
+ 2026-01-25 00:44:09 | INFO | 1. Step 299: acc=0.7081 (reward_model.best_299.pt)
95
+ 2026-01-25 00:44:09 | INFO | 2. Step 199: acc=0.6965 (reward_model.best_199.pt)
96
+ 2026-01-25 00:44:09 | INFO | 3. Step 99: acc=0.6864 (reward_model.best_99.pt)
97
+ 2026-01-25 00:44:11 | INFO | Step 300: loss=0.3611 | IF_loss=0.4409, MQ_loss=0.2813 | acc=0.812 (IF=0.792, MQ=0.833) | lr=0.000009
98
+ 2026-01-25 00:46:11 | INFO |
99
+ ============================================================
100
+ Validation Results (took 8.37s):
101
+ Samples: 346 instruction, 346 quality
102
+ Instruction Acc: 0.6908
103
+ Quality Acc: 0.7168
104
+ Average Acc: 0.7038
105
+ Total Loss: 0.5862
106
+ Instruction Loss: 0.6208
107
+ Quality Loss: 0.5516
108
+ ============================================================
109
+ 2026-01-25 00:46:11 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_399.pt (filtered to 38.584M trainable parameters)
110
+ 2026-01-25 00:46:12 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_399.pt (428.0MB)
111
+ 2026-01-25 00:46:12 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_99.pt
112
+ 2026-01-25 00:46:12 | INFO | Best 3 checkpoints:
113
+ 2026-01-25 00:46:12 | INFO | 1. Step 299: acc=0.7081 (reward_model.best_299.pt)
114
+ 2026-01-25 00:46:12 | INFO | 2. Step 399: acc=0.7038 (reward_model.best_399.pt)
115
+ 2026-01-25 00:46:12 | INFO | 3. Step 199: acc=0.6965 (reward_model.best_199.pt)
116
+ 2026-01-25 00:46:13 | INFO | Step 400: loss=0.3193 | IF_loss=0.3378, MQ_loss=0.3007 | acc=0.865 (IF=0.833, MQ=0.896) | lr=0.000009
117
+ 2026-01-25 00:48:10 | INFO |
118
+ ============================================================
119
+ Validation Results (took 8.01s):
120
+ Samples: 346 instruction, 346 quality
121
+ Instruction Acc: 0.6763
122
+ Quality Acc: 0.7312
123
+ Average Acc: 0.7038
124
+ Total Loss: 0.5854
125
+ Instruction Loss: 0.6252
126
+ Quality Loss: 0.5457
127
+ ============================================================
128
+ 2026-01-25 00:48:11 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_499.pt (filtered to 38.584M trainable parameters)
129
+ 2026-01-25 00:48:11 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_499.pt (428.0MB)
130
+ 2026-01-25 00:48:11 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_199.pt
131
+ 2026-01-25 00:48:11 | INFO | Best 3 checkpoints:
132
+ 2026-01-25 00:48:11 | INFO | 1. Step 299: acc=0.7081 (reward_model.best_299.pt)
133
+ 2026-01-25 00:48:11 | INFO | 2. Step 399: acc=0.7038 (reward_model.best_399.pt)
134
+ 2026-01-25 00:48:11 | INFO | 3. Step 499: acc=0.7038 (reward_model.best_499.pt)
135
+ 2026-01-25 00:48:12 | INFO | Step 500: loss=0.3185 | IF_loss=0.3553, MQ_loss=0.2816 | acc=0.844 (IF=0.875, MQ=0.812) | lr=0.000009
136
+ 2026-01-25 00:50:10 | INFO |
137
+ ============================================================
138
+ Validation Results (took 7.80s):
139
+ Samples: 346 instruction, 346 quality
140
+ Instruction Acc: 0.6792
141
+ Quality Acc: 0.7486
142
+ Average Acc: 0.7139
143
+ Total Loss: 0.5868
144
+ Instruction Loss: 0.6327
145
+ Quality Loss: 0.5409
146
+ ============================================================
147
+ 2026-01-25 00:50:10 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_599.pt (filtered to 38.584M trainable parameters)
148
+ 2026-01-25 00:50:11 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_599.pt (428.0MB)
149
+ 2026-01-25 00:50:11 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_499.pt
150
+ 2026-01-25 00:50:11 | INFO | Best 3 checkpoints:
151
+ 2026-01-25 00:50:11 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
152
+ 2026-01-25 00:50:11 | INFO | 2. Step 299: acc=0.7081 (reward_model.best_299.pt)
153
+ 2026-01-25 00:50:11 | INFO | 3. Step 399: acc=0.7038 (reward_model.best_399.pt)
154
+ 2026-01-25 00:50:12 | INFO | Step 600: loss=0.3412 | IF_loss=0.3309, MQ_loss=0.3515 | acc=0.844 (IF=0.875, MQ=0.812) | lr=0.000008
155
+ 2026-01-25 00:52:10 | INFO |
156
+ ============================================================
157
+ Validation Results (took 8.05s):
158
+ Samples: 346 instruction, 346 quality
159
+ Instruction Acc: 0.6850
160
+ Quality Acc: 0.7399
161
+ Average Acc: 0.7124
162
+ Total Loss: 0.6273
163
+ Instruction Loss: 0.6640
164
+ Quality Loss: 0.5907
165
+ ============================================================
166
+ 2026-01-25 00:52:10 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_699.pt (filtered to 38.584M trainable parameters)
167
+ 2026-01-25 00:52:11 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_699.pt (428.0MB)
168
+ 2026-01-25 00:52:11 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_399.pt
169
+ 2026-01-25 00:52:11 | INFO | Best 3 checkpoints:
170
+ 2026-01-25 00:52:11 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
171
+ 2026-01-25 00:52:11 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
172
+ 2026-01-25 00:52:11 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
173
+ 2026-01-25 00:52:12 | INFO | Step 700: loss=0.1745 | IF_loss=0.2334, MQ_loss=0.1156 | acc=0.917 (IF=0.875, MQ=0.958) | lr=0.000007
174
+ 2026-01-25 00:54:13 | INFO |
175
+ ============================================================
176
+ Validation Results (took 8.54s):
177
+ Samples: 346 instruction, 346 quality
178
+ Instruction Acc: 0.6590
179
+ Quality Acc: 0.7341
180
+ Average Acc: 0.6965
181
+ Total Loss: 0.6533
182
+ Instruction Loss: 0.6973
183
+ Quality Loss: 0.6092
184
+ ============================================================
185
+ 2026-01-25 00:54:13 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_799.pt (filtered to 38.584M trainable parameters)
186
+ 2026-01-25 00:54:14 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_799.pt (428.0MB)
187
+ 2026-01-25 00:54:14 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_799.pt
188
+ 2026-01-25 00:54:14 | INFO | Best 3 checkpoints:
189
+ 2026-01-25 00:54:14 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
190
+ 2026-01-25 00:54:14 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
191
+ 2026-01-25 00:54:14 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
192
+ 2026-01-25 00:54:15 | INFO | Step 800: loss=0.2953 | IF_loss=0.3655, MQ_loss=0.2252 | acc=0.875 (IF=0.833, MQ=0.917) | lr=0.000007
193
+ 2026-01-25 00:56:15 | INFO |
194
+ ============================================================
195
+ Validation Results (took 8.04s):
196
+ Samples: 346 instruction, 346 quality
197
+ Instruction Acc: 0.6590
198
+ Quality Acc: 0.7283
199
+ Average Acc: 0.6936
200
+ Total Loss: 0.6663
201
+ Instruction Loss: 0.7004
202
+ Quality Loss: 0.6321
203
+ ============================================================
204
+ 2026-01-25 00:56:15 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_899.pt (filtered to 38.584M trainable parameters)
205
+ 2026-01-25 00:56:16 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_899.pt (428.0MB)
206
+ 2026-01-25 00:56:16 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_899.pt
207
+ 2026-01-25 00:56:16 | INFO | Best 3 checkpoints:
208
+ 2026-01-25 00:56:16 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
209
+ 2026-01-25 00:56:16 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
210
+ 2026-01-25 00:56:16 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
211
+ 2026-01-25 00:56:17 | INFO | Step 900: loss=0.1683 | IF_loss=0.1746, MQ_loss=0.1621 | acc=0.938 (IF=0.958, MQ=0.917) | lr=0.000006
212
+ 2026-01-25 00:58:18 | INFO |
213
+ ============================================================
214
+ Validation Results (took 9.16s):
215
+ Samples: 346 instruction, 346 quality
216
+ Instruction Acc: 0.6561
217
+ Quality Acc: 0.7428
218
+ Average Acc: 0.6994
219
+ Total Loss: 0.6976
220
+ Instruction Loss: 0.7340
221
+ Quality Loss: 0.6612
222
+ ============================================================
223
+ 2026-01-25 00:58:19 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_999.pt (filtered to 38.584M trainable parameters)
224
+ 2026-01-25 00:58:19 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_999.pt (428.0MB)
225
+ 2026-01-25 00:58:19 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_999.pt
226
+ 2026-01-25 00:58:19 | INFO | Best 3 checkpoints:
227
+ 2026-01-25 00:58:19 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
228
+ 2026-01-25 00:58:19 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
229
+ 2026-01-25 00:58:19 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
230
+ 2026-01-25 00:58:20 | INFO | Step 1000: loss=0.1489 | IF_loss=0.1420, MQ_loss=0.1559 | acc=0.948 (IF=0.938, MQ=0.958) | lr=0.000005
231
+ 2026-01-25 01:00:15 | INFO |
232
+ ============================================================
233
+ Validation Results (took 7.24s):
234
+ Samples: 346 instruction, 346 quality
235
+ Instruction Acc: 0.6590
236
+ Quality Acc: 0.7312
237
+ Average Acc: 0.6951
238
+ Total Loss: 0.7224
239
+ Instruction Loss: 0.7648
240
+ Quality Loss: 0.6801
241
+ ============================================================
242
+ 2026-01-25 01:00:15 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1099.pt (filtered to 38.584M trainable parameters)
243
+ 2026-01-25 01:00:15 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1099.pt (428.0MB)
244
+ 2026-01-25 01:00:16 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1099.pt
245
+ 2026-01-25 01:00:16 | INFO | Best 3 checkpoints:
246
+ 2026-01-25 01:00:16 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
247
+ 2026-01-25 01:00:16 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
248
+ 2026-01-25 01:00:16 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
249
+ 2026-01-25 01:00:16 | INFO | Step 1100: loss=0.1252 | IF_loss=0.1115, MQ_loss=0.1390 | acc=0.958 (IF=0.979, MQ=0.938) | lr=0.000004
250
+ 2026-01-25 01:02:06 | INFO |
251
+ ============================================================
252
+ Validation Results (took 7.27s):
253
+ Samples: 346 instruction, 346 quality
254
+ Instruction Acc: 0.6503
255
+ Quality Acc: 0.7283
256
+ Average Acc: 0.6893
257
+ Total Loss: 0.7565
258
+ Instruction Loss: 0.8083
259
+ Quality Loss: 0.7047
260
+ ============================================================
261
+ 2026-01-25 01:02:06 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1199.pt (filtered to 38.584M trainable parameters)
262
+ 2026-01-25 01:02:06 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1199.pt (428.0MB)
263
+ 2026-01-25 01:02:06 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1199.pt
264
+ 2026-01-25 01:02:06 | INFO | Best 3 checkpoints:
265
+ 2026-01-25 01:02:06 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
266
+ 2026-01-25 01:02:06 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
267
+ 2026-01-25 01:02:06 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
268
+ 2026-01-25 01:02:07 | INFO | Step 1200: loss=0.1319 | IF_loss=0.1250, MQ_loss=0.1388 | acc=0.896 (IF=0.875, MQ=0.917) | lr=0.000003
269
+ 2026-01-25 01:04:00 | INFO |
270
+ ============================================================
271
+ Validation Results (took 7.46s):
272
+ Samples: 346 instruction, 346 quality
273
+ Instruction Acc: 0.6647
274
+ Quality Acc: 0.7283
275
+ Average Acc: 0.6965
276
+ Total Loss: 0.7569
277
+ Instruction Loss: 0.8060
278
+ Quality Loss: 0.7079
279
+ ============================================================
280
+ 2026-01-25 01:04:00 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1299.pt (filtered to 38.584M trainable parameters)
281
+ 2026-01-25 01:04:00 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1299.pt (428.0MB)
282
+ 2026-01-25 01:04:01 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1299.pt
283
+ 2026-01-25 01:04:01 | INFO | Best 3 checkpoints:
284
+ 2026-01-25 01:04:01 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
285
+ 2026-01-25 01:04:01 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
286
+ 2026-01-25 01:04:01 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
287
+ 2026-01-25 01:04:02 | INFO | Step 1300: loss=0.1072 | IF_loss=0.1049, MQ_loss=0.1095 | acc=0.958 (IF=0.979, MQ=0.938) | lr=0.000003
288
+ 2026-01-25 01:05:51 | INFO |
289
+ ============================================================
290
+ Validation Results (took 7.30s):
291
+ Samples: 346 instruction, 346 quality
292
+ Instruction Acc: 0.6445
293
+ Quality Acc: 0.7254
294
+ Average Acc: 0.6850
295
+ Total Loss: 0.7646
296
+ Instruction Loss: 0.8179
297
+ Quality Loss: 0.7114
298
+ ============================================================
299
+ 2026-01-25 01:05:51 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1399.pt (filtered to 38.584M trainable parameters)
300
+ 2026-01-25 01:05:52 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1399.pt (428.0MB)
301
+ 2026-01-25 01:05:52 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1399.pt
302
+ 2026-01-25 01:05:52 | INFO | Best 3 checkpoints:
303
+ 2026-01-25 01:05:52 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
304
+ 2026-01-25 01:05:52 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
305
+ 2026-01-25 01:05:52 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
306
+ 2026-01-25 01:05:53 | INFO | Step 1400: loss=0.1399 | IF_loss=0.1608, MQ_loss=0.1191 | acc=0.969 (IF=0.958, MQ=0.979) | lr=0.000002
307
+ 2026-01-25 01:07:49 | INFO |
308
+ ============================================================
309
+ Validation Results (took 7.28s):
310
+ Samples: 346 instruction, 346 quality
311
+ Instruction Acc: 0.6474
312
+ Quality Acc: 0.7370
313
+ Average Acc: 0.6922
314
+ Total Loss: 0.7650
315
+ Instruction Loss: 0.8168
316
+ Quality Loss: 0.7131
317
+ ============================================================
318
+ 2026-01-25 01:07:49 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1499.pt (filtered to 38.584M trainable parameters)
319
+ 2026-01-25 01:07:50 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1499.pt (428.0MB)
320
+ 2026-01-25 01:07:50 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1499.pt
321
+ 2026-01-25 01:07:50 | INFO | Best 3 checkpoints:
322
+ 2026-01-25 01:07:50 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
323
+ 2026-01-25 01:07:50 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
324
+ 2026-01-25 01:07:50 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
325
+ 2026-01-25 01:07:51 | INFO | Step 1500: loss=0.0816 | IF_loss=0.1031, MQ_loss=0.0600 | acc=0.969 (IF=0.979, MQ=0.958) | lr=0.000001
326
+ 2026-01-25 01:09:42 | INFO |
327
+ ============================================================
328
+ Validation Results (took 7.73s):
329
+ Samples: 346 instruction, 346 quality
330
+ Instruction Acc: 0.6445
331
+ Quality Acc: 0.7254
332
+ Average Acc: 0.6850
333
+ Total Loss: 0.7697
334
+ Instruction Loss: 0.8237
335
+ Quality Loss: 0.7156
336
+ ============================================================
337
+ 2026-01-25 01:09:42 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1599.pt (filtered to 38.584M trainable parameters)
338
+ 2026-01-25 01:09:42 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1599.pt (428.0MB)
339
+ 2026-01-25 01:09:42 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1599.pt
340
+ 2026-01-25 01:09:42 | INFO | Best 3 checkpoints:
341
+ 2026-01-25 01:09:42 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
342
+ 2026-01-25 01:09:42 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
343
+ 2026-01-25 01:09:42 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
344
+ 2026-01-25 01:09:47 | INFO | Step 1600: loss=0.0867 | IF_loss=0.0924, MQ_loss=0.0809 | acc=0.990 (IF=0.979, MQ=1.000) | lr=0.000001
345
+ 2026-01-25 01:11:40 | INFO |
346
+ ============================================================
347
+ Validation Results (took 8.52s):
348
+ Samples: 346 instruction, 346 quality
349
+ Instruction Acc: 0.6532
350
+ Quality Acc: 0.7283
351
+ Average Acc: 0.6908
352
+ Total Loss: 0.7751
353
+ Instruction Loss: 0.8262
354
+ Quality Loss: 0.7239
355
+ ============================================================
356
+ 2026-01-25 01:11:40 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1699.pt (filtered to 38.584M trainable parameters)
357
+ 2026-01-25 01:11:41 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1699.pt (428.0MB)
358
+ 2026-01-25 01:11:41 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1699.pt
359
+ 2026-01-25 01:11:41 | INFO | Best 3 checkpoints:
360
+ 2026-01-25 01:11:41 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
361
+ 2026-01-25 01:11:41 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
362
+ 2026-01-25 01:11:41 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
363
+ 2026-01-25 01:11:42 | INFO | Step 1700: loss=0.1204 | IF_loss=0.0824, MQ_loss=0.1585 | acc=0.927 (IF=0.979, MQ=0.875) | lr=0.000001
364
+ 2026-01-25 01:13:36 | INFO |
365
+ ============================================================
366
+ Validation Results (took 7.21s):
367
+ Samples: 346 instruction, 346 quality
368
+ Instruction Acc: 0.6590
369
+ Quality Acc: 0.7283
370
+ Average Acc: 0.6936
371
+ Total Loss: 0.7810
372
+ Instruction Loss: 0.8338
373
+ Quality Loss: 0.7282
374
+ ============================================================
375
+ 2026-01-25 01:13:36 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1799.pt (filtered to 38.584M trainable parameters)
376
+ 2026-01-25 01:13:36 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1799.pt (428.0MB)
377
+ 2026-01-25 01:13:37 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1799.pt
378
+ 2026-01-25 01:13:37 | INFO | Best 3 checkpoints:
379
+ 2026-01-25 01:13:37 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
380
+ 2026-01-25 01:13:37 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
381
+ 2026-01-25 01:13:37 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
382
+ 2026-01-25 01:13:38 | INFO | Step 1800: loss=0.0594 | IF_loss=0.0719, MQ_loss=0.0470 | acc=0.979 (IF=0.958, MQ=1.000) | lr=0.000000
383
+ 2026-01-25 01:15:28 | INFO |
384
+ ============================================================
385
+ Validation Results (took 7.61s):
386
+ Samples: 346 instruction, 346 quality
387
+ Instruction Acc: 0.6532
388
+ Quality Acc: 0.7283
389
+ Average Acc: 0.6908
390
+ Total Loss: 0.7827
391
+ Instruction Loss: 0.8356
392
+ Quality Loss: 0.7299
393
+ ============================================================
394
+ 2026-01-25 01:15:28 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1899.pt (filtered to 38.584M trainable parameters)
395
+ 2026-01-25 01:15:28 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1899.pt (428.0MB)
396
+ 2026-01-25 01:15:28 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1899.pt
397
+ 2026-01-25 01:15:28 | INFO | Best 3 checkpoints:
398
+ 2026-01-25 01:15:28 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
399
+ 2026-01-25 01:15:28 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
400
+ 2026-01-25 01:15:28 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
401
+ 2026-01-25 01:15:29 | INFO | Step 1900: loss=0.1343 | IF_loss=0.1457, MQ_loss=0.1229 | acc=0.906 (IF=0.896, MQ=0.917) | lr=0.000000
402
+ 2026-01-25 01:17:19 | INFO |
403
+ ============================================================
404
+ Validation Results (took 6.92s):
405
+ Samples: 346 instruction, 346 quality
406
+ Instruction Acc: 0.6532
407
+ Quality Acc: 0.7312
408
+ Average Acc: 0.6922
409
+ Total Loss: 0.7832
410
+ Instruction Loss: 0.8361
411
+ Quality Loss: 0.7304
412
+ ============================================================
413
+ 2026-01-25 01:17:19 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1999.pt (filtered to 38.584M trainable parameters)
414
+ 2026-01-25 01:17:20 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1999.pt (428.0MB)
415
+ 2026-01-25 01:17:20 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0037/ckpt/reward_model.best_1999.pt
416
+ 2026-01-25 01:17:20 | INFO | Best 3 checkpoints:
417
+ 2026-01-25 01:17:20 | INFO | 1. Step 599: acc=0.7139 (reward_model.best_599.pt)
418
+ 2026-01-25 01:17:20 | INFO | 2. Step 699: acc=0.7124 (reward_model.best_699.pt)
419
+ 2026-01-25 01:17:20 | INFO | 3. Step 299: acc=0.7081 (reward_model.best_299.pt)
420
+ 2026-01-25 01:17:20 | INFO | Training complete!
421
+ 2026-01-25 01:17:20 | INFO | Training complete!
20260125_0038/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '3'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: true
10
+ apply_to_ref: true
11
+ enabled: true
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 200
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ max_samples: null
21
+ max_val_samples: null
22
+ metadata_jsonl: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/all_comparisons.jsonl
23
+ mode: raw_text_frozen_audio
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: null
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ no_condition: false
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_audio: true
107
+ use_layer_idx: -1
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: null
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 2000
125
+ warmup_steps: 10
126
+ max_grad_norm: 1
127
+ mlp_lr: 1.0e-05
128
+ num_train_steps: 2000
129
+ num_valid_batches: null
130
+ num_workers: 8
131
+ other_lr: 1.0e-05
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.20000.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: false
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 100
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: false
20260125_0038/reward_model/1769272741.4481056/events.out.tfevents.1769272741.MACLAB-S004.3419169.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b05b3493f3a74ca2aaaf8a9d4104cacb90a38935cf1f11482a5dd926ef450af
3
+ size 503
20260125_0038/reward_model/1769272741.4495451/hparams.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ batch_size: 48
2
+ grad_accum_every: 1
3
+ learning_rate: 1.0e-05
4
+ num_train_steps: 2000
20260125_0038/reward_model/events.out.tfevents.1769272741.MACLAB-S004.3419169.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87b529d6d3202ede1b59405b16990059f5aa626adc0cd7c689cc5f35b07c43d6
3
+ size 428856
20260125_0038/train.20260125_0038.log ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-25 00:38:50 | INFO | Log file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/train.20260125_0038.log
2
+ 2026-01-25 00:38:50 | INFO | Random seed set to 42
3
+ 2026-01-25 00:38:51 | INFO | Created RawTextFrozenAudioDataset with 3463 samples
4
+ 2026-01-25 00:38:51 | INFO | Split dataset into train (3117) and validation (346) sets (ratio: 10.00%)
5
+ 2026-01-25 00:38:51 | INFO | Will resume from checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.20000.pt
6
+ 2026-01-25 00:38:51 | INFO | Using checkpoint config for model initialization (continue training mode)
7
+ 2026-01-25 00:38:59 | INFO | Created RewardAttentionModel with attention_mode=SA
8
+ 2026-01-25 00:38:59 | INFO | Created PreferenceLoss with filter_ties=True
9
+ 2026-01-25 00:38:59 | INFO | ✓ Gradient checkpointing enabled
10
+ 2026-01-25 00:38:59 | INFO | ✓ Audio cropping enabled: min=200, max=1500
11
+ 2026-01-25 00:38:59 | INFO | Apply to eval: True, ref: True
12
+ 2026-01-25 00:38:59 | INFO | Modes: train=random, val=start
13
+ 2026-01-25 00:38:59 | INFO | MLP head parameters: 1,186,563 params, lr=1e-05
14
+ 2026-01-25 00:38:59 | INFO | Other parameters: 37,397,634 params, lr=1e-05
15
+ 2026-01-25 00:38:59 | INFO | Using lr_schedule=linear_cosine warmup_steps=10 total_steps=2000
16
+ 2026-01-25 00:38:59 | INFO | Training with fixed validation set
17
+ 2026-01-25 00:38:59 | INFO | Train batch_size: 48, Valid batch_size: 20
18
+ 2026-01-25 00:39:00 | INFO | Missing keys (782): ['text_module.model.embeddings.word_embeddings.weight', 'text_module.model.embeddings.position_embeddings.weight', 'text_module.model.embeddings.token_type_embeddings.weight', 'text_module.model.embeddings.LayerNorm.weight', 'text_module.model.embeddings.LayerNorm.bias']...
19
+ 2026-01-25 00:39:00 | INFO | ✓ Starting from step 0 (transfer learning mode, ignoring checkpoint steps=20000)
20
+ 2026-01-25 00:39:00 | INFO | Resumed from /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.20000.pt
21
+ 2026-01-25 00:39:00 | INFO | Parameters: 701.162M total, 38.584M trainable
22
+ 2026-01-25 00:39:00 | INFO | Text encoder (frozen): 328.389M
23
+ 2026-01-25 00:39:00 | INFO | Audio encoder (frozen): 334.189M
24
+ 2026-01-25 00:39:00 | INFO | Other trainable: 38.584M
25
+ 2026-01-25 00:39:00 | INFO | ℹ No LoRA configuration detected
26
+ 2026-01-25 00:39:01 | INFO | ============================================================
27
+ 2026-01-25 00:39:01 | INFO | Ready to start training
28
+ 2026-01-25 00:39:01 | INFO | ============================================================
29
+ 2026-01-25 00:39:01 | INFO | Starting training from step 0
30
+ 2026-01-25 00:39:01 | INFO | ===== Accelerator / CUDA Debug Info =====
31
+ 2026-01-25 00:39:01 | INFO | accelerator.device = cuda
32
+ 2026-01-25 00:39:01 | INFO | mixed_precision = bf16
33
+ 2026-01-25 00:39:01 | INFO | distributed_type = NO
34
+ 2026-01-25 00:39:01 | INFO | num_processes = 1
35
+ 2026-01-25 00:39:01 | INFO | process_index = 0
36
+ 2026-01-25 00:39:01 | INFO | is_main_process = True
37
+ 2026-01-25 00:39:01 | INFO | torch.cuda.is_available() = True
38
+ 2026-01-25 00:39:01 | INFO | torch.cuda.device_count() = 1
39
+ 2026-01-25 00:39:01 | INFO | current_device = 0
40
+ 2026-01-25 00:39:01 | INFO | device_name = NVIDIA GeForce RTX 4090
41
+ 2026-01-25 00:39:01 | INFO | model parameter device = cuda:0
42
+ 2026-01-25 00:39:01 | INFO | Training for 2000.0 steps (~32 epochs, 64 steps/epoch)
43
+ 2026-01-25 00:39:12 | INFO | Step 0: loss=1.3478 | IF_loss=1.7487, MQ_loss=0.9469 | acc=0.719 (IF=0.667, MQ=0.771) | lr=0.000002
44
+ 2026-01-25 00:39:12 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.0.pt (filtered to 38.584M trainable parameters)
45
+ 2026-01-25 00:39:13 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.0.pt (428.0MB)
46
+ 2026-01-25 00:39:13 | INFO | Step 0: Saved to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.0.pt
47
+ 2026-01-25 00:41:09 | INFO |
48
+ ============================================================
49
+ Validation Results (took 10.74s):
50
+ Samples: 346 instruction, 346 quality
51
+ Instruction Acc: 0.7052
52
+ Quality Acc: 0.7139
53
+ Average Acc: 0.7095
54
+ Total Loss: 0.9740
55
+ Instruction Loss: 0.9900
56
+ Quality Loss: 0.9581
57
+ ============================================================
58
+ 2026-01-25 00:41:09 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_99.pt (filtered to 38.584M trainable parameters)
59
+ 2026-01-25 00:41:09 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_99.pt (428.0MB)
60
+ 2026-01-25 00:41:09 | INFO | Best 1 checkpoints:
61
+ 2026-01-25 00:41:09 | INFO | 1. Step 99: acc=0.7095 (reward_model.best_99.pt)
62
+ 2026-01-25 00:41:10 | INFO | Step 100: loss=0.8833 | IF_loss=0.7300, MQ_loss=1.0365 | acc=0.688 (IF=0.708, MQ=0.667) | lr=0.000010
63
+ 2026-01-25 00:43:08 | INFO |
64
+ ============================================================
65
+ Validation Results (took 7.91s):
66
+ Samples: 346 instruction, 346 quality
67
+ Instruction Acc: 0.7139
68
+ Quality Acc: 0.7370
69
+ Average Acc: 0.7254
70
+ Total Loss: 0.6643
71
+ Instruction Loss: 0.6989
72
+ Quality Loss: 0.6297
73
+ ============================================================
74
+ 2026-01-25 00:43:08 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_199.pt (filtered to 38.584M trainable parameters)
75
+ 2026-01-25 00:43:08 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_199.pt (428.0MB)
76
+ 2026-01-25 00:43:08 | INFO | Best 2 checkpoints:
77
+ 2026-01-25 00:43:08 | INFO | 1. Step 199: acc=0.7254 (reward_model.best_199.pt)
78
+ 2026-01-25 00:43:08 | INFO | 2. Step 99: acc=0.7095 (reward_model.best_99.pt)
79
+ 2026-01-25 00:43:09 | INFO | Step 200: loss=0.3416 | IF_loss=0.3084, MQ_loss=0.3748 | acc=0.823 (IF=0.854, MQ=0.792) | lr=0.000010
80
+ 2026-01-25 00:45:01 | INFO |
81
+ ============================================================
82
+ Validation Results (took 8.58s):
83
+ Samples: 346 instruction, 346 quality
84
+ Instruction Acc: 0.7168
85
+ Quality Acc: 0.7543
86
+ Average Acc: 0.7355
87
+ Total Loss: 0.6117
88
+ Instruction Loss: 0.6554
89
+ Quality Loss: 0.5680
90
+ ============================================================
91
+ 2026-01-25 00:45:01 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_299.pt (filtered to 38.584M trainable parameters)
92
+ 2026-01-25 00:45:01 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_299.pt (428.0MB)
93
+ 2026-01-25 00:45:01 | INFO | Best 3 checkpoints:
94
+ 2026-01-25 00:45:01 | INFO | 1. Step 299: acc=0.7355 (reward_model.best_299.pt)
95
+ 2026-01-25 00:45:01 | INFO | 2. Step 199: acc=0.7254 (reward_model.best_199.pt)
96
+ 2026-01-25 00:45:01 | INFO | 3. Step 99: acc=0.7095 (reward_model.best_99.pt)
97
+ 2026-01-25 00:45:02 | INFO | Step 300: loss=0.3717 | IF_loss=0.4516, MQ_loss=0.2917 | acc=0.844 (IF=0.854, MQ=0.833) | lr=0.000009
98
+ 2026-01-25 00:46:56 | INFO |
99
+ ============================================================
100
+ Validation Results (took 8.46s):
101
+ Samples: 346 instruction, 346 quality
102
+ Instruction Acc: 0.7081
103
+ Quality Acc: 0.7630
104
+ Average Acc: 0.7355
105
+ Total Loss: 0.6097
106
+ Instruction Loss: 0.6561
107
+ Quality Loss: 0.5632
108
+ ============================================================
109
+ 2026-01-25 00:46:56 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_399.pt (filtered to 38.584M trainable parameters)
110
+ 2026-01-25 00:46:56 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_399.pt (428.0MB)
111
+ 2026-01-25 00:46:56 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_99.pt
112
+ 2026-01-25 00:46:56 | INFO | Best 3 checkpoints:
113
+ 2026-01-25 00:46:56 | INFO | 1. Step 299: acc=0.7355 (reward_model.best_299.pt)
114
+ 2026-01-25 00:46:56 | INFO | 2. Step 399: acc=0.7355 (reward_model.best_399.pt)
115
+ 2026-01-25 00:46:56 | INFO | 3. Step 199: acc=0.7254 (reward_model.best_199.pt)
116
+ 2026-01-25 00:46:57 | INFO | Step 400: loss=0.5054 | IF_loss=0.5431, MQ_loss=0.4678 | acc=0.792 (IF=0.750, MQ=0.833) | lr=0.000009
117
+ 2026-01-25 00:48:50 | INFO |
118
+ ============================================================
119
+ Validation Results (took 8.33s):
120
+ Samples: 346 instruction, 346 quality
121
+ Instruction Acc: 0.7110
122
+ Quality Acc: 0.7630
123
+ Average Acc: 0.7370
124
+ Total Loss: 0.6197
125
+ Instruction Loss: 0.6728
126
+ Quality Loss: 0.5666
127
+ ============================================================
128
+ 2026-01-25 00:48:50 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_499.pt (filtered to 38.584M trainable parameters)
129
+ 2026-01-25 00:48:50 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_499.pt (428.0MB)
130
+ 2026-01-25 00:48:50 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_199.pt
131
+ 2026-01-25 00:48:50 | INFO | Best 3 checkpoints:
132
+ 2026-01-25 00:48:50 | INFO | 1. Step 499: acc=0.7370 (reward_model.best_499.pt)
133
+ 2026-01-25 00:48:50 | INFO | 2. Step 299: acc=0.7355 (reward_model.best_299.pt)
134
+ 2026-01-25 00:48:50 | INFO | 3. Step 399: acc=0.7355 (reward_model.best_399.pt)
135
+ 2026-01-25 00:48:51 | INFO | Step 500: loss=0.4587 | IF_loss=0.5137, MQ_loss=0.4036 | acc=0.698 (IF=0.688, MQ=0.708) | lr=0.000009
136
+ 2026-01-25 00:50:47 | INFO |
137
+ ============================================================
138
+ Validation Results (took 7.48s):
139
+ Samples: 346 instruction, 346 quality
140
+ Instruction Acc: 0.7139
141
+ Quality Acc: 0.7601
142
+ Average Acc: 0.7370
143
+ Total Loss: 0.6275
144
+ Instruction Loss: 0.6826
145
+ Quality Loss: 0.5724
146
+ ============================================================
147
+ 2026-01-25 00:50:47 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_599.pt (filtered to 38.584M trainable parameters)
148
+ 2026-01-25 00:50:48 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_599.pt (428.0MB)
149
+ 2026-01-25 00:50:48 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_399.pt
150
+ 2026-01-25 00:50:48 | INFO | Best 3 checkpoints:
151
+ 2026-01-25 00:50:48 | INFO | 1. Step 499: acc=0.7370 (reward_model.best_499.pt)
152
+ 2026-01-25 00:50:48 | INFO | 2. Step 599: acc=0.7370 (reward_model.best_599.pt)
153
+ 2026-01-25 00:50:48 | INFO | 3. Step 299: acc=0.7355 (reward_model.best_299.pt)
154
+ 2026-01-25 00:50:49 | INFO | Step 600: loss=0.4492 | IF_loss=0.4779, MQ_loss=0.4205 | acc=0.833 (IF=0.792, MQ=0.875) | lr=0.000008
155
+ 2026-01-25 00:52:39 | INFO |
156
+ ============================================================
157
+ Validation Results (took 7.99s):
158
+ Samples: 346 instruction, 346 quality
159
+ Instruction Acc: 0.6994
160
+ Quality Acc: 0.7659
161
+ Average Acc: 0.7327
162
+ Total Loss: 0.6397
163
+ Instruction Loss: 0.7005
164
+ Quality Loss: 0.5788
165
+ ============================================================
166
+ 2026-01-25 00:52:39 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_699.pt (filtered to 38.584M trainable parameters)
167
+ 2026-01-25 00:52:40 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_699.pt (428.0MB)
168
+ 2026-01-25 00:52:40 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_699.pt
169
+ 2026-01-25 00:52:40 | INFO | Best 3 checkpoints:
170
+ 2026-01-25 00:52:40 | INFO | 1. Step 499: acc=0.7370 (reward_model.best_499.pt)
171
+ 2026-01-25 00:52:40 | INFO | 2. Step 599: acc=0.7370 (reward_model.best_599.pt)
172
+ 2026-01-25 00:52:40 | INFO | 3. Step 299: acc=0.7355 (reward_model.best_299.pt)
173
+ 2026-01-25 00:52:41 | INFO | Step 700: loss=0.2280 | IF_loss=0.3205, MQ_loss=0.1356 | acc=0.938 (IF=0.917, MQ=0.958) | lr=0.000007
174
+ 2026-01-25 00:54:34 | INFO |
175
+ ============================================================
176
+ Validation Results (took 7.95s):
177
+ Samples: 346 instruction, 346 quality
178
+ Instruction Acc: 0.7110
179
+ Quality Acc: 0.7688
180
+ Average Acc: 0.7399
181
+ Total Loss: 0.6517
182
+ Instruction Loss: 0.7169
183
+ Quality Loss: 0.5864
184
+ ============================================================
185
+ 2026-01-25 00:54:34 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_799.pt (filtered to 38.584M trainable parameters)
186
+ 2026-01-25 00:54:34 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_799.pt (428.0MB)
187
+ 2026-01-25 00:54:34 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_299.pt
188
+ 2026-01-25 00:54:34 | INFO | Best 3 checkpoints:
189
+ 2026-01-25 00:54:34 | INFO | 1. Step 799: acc=0.7399 (reward_model.best_799.pt)
190
+ 2026-01-25 00:54:34 | INFO | 2. Step 499: acc=0.7370 (reward_model.best_499.pt)
191
+ 2026-01-25 00:54:34 | INFO | 3. Step 599: acc=0.7370 (reward_model.best_599.pt)
192
+ 2026-01-25 00:54:35 | INFO | Step 800: loss=0.3855 | IF_loss=0.4637, MQ_loss=0.3072 | acc=0.792 (IF=0.750, MQ=0.833) | lr=0.000007
193
+ 2026-01-25 00:56:29 | INFO |
194
+ ============================================================
195
+ Validation Results (took 7.96s):
196
+ Samples: 346 instruction, 346 quality
197
+ Instruction Acc: 0.7110
198
+ Quality Acc: 0.7572
199
+ Average Acc: 0.7341
200
+ Total Loss: 0.6621
201
+ Instruction Loss: 0.7294
202
+ Quality Loss: 0.5948
203
+ ============================================================
204
+ 2026-01-25 00:56:29 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_899.pt (filtered to 38.584M trainable parameters)
205
+ 2026-01-25 00:56:29 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_899.pt (428.0MB)
206
+ 2026-01-25 00:56:29 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260125_0038/ckpt/reward_model.best_899.pt
207
+ 2026-01-25 00:56:29 | INFO | Best 3 checkpoints:
208
+ 2026-01-25 00:56:29 | INFO | 1. Step 799: acc=0.7399 (reward_model.best_799.pt)
209
+ 2026-01-25 00:56:29 | INFO | 2. Step 499: acc=0.7370 (reward_model.best_499.pt)
210
+ 2026-01-25 00:56:29 | INFO | 3. Step 599: acc=0.7370 (reward_model.best_599.pt)
211
+ 2026-01-25 00:56:30 | INFO | Step 900: loss=0.3468 | IF_loss=0.3446, MQ_loss=0.3489 | acc=0.812 (IF=0.812, MQ=0.812) | lr=0.000006
20260125_0933/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '0'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: true
10
+ apply_to_ref: true
11
+ enabled: true
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 200
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ max_samples: null
21
+ max_val_samples: null
22
+ metadata_jsonl: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/all_comparisons.jsonl
23
+ mode: raw_text_frozen_audio
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: null
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ no_condition: false
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_audio: true
107
+ use_layer_idx: -1
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: null
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 2000
125
+ warmup_steps: 10
126
+ max_grad_norm: 1
127
+ mlp_lr: 1.0e-05
128
+ num_train_steps: 2000
129
+ num_valid_batches: null
130
+ num_workers: 8
131
+ other_lr: 1.0e-05
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: false
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 100
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: false
20260125_0933/reward_model/1769304848.6545663/events.out.tfevents.1769304848.MACLAB-S004.1519845.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c582683cea0697f98b6b4b9e504078b8949e1df961163c7183bb40829fde464
3
+ size 503
20260125_0933/reward_model/1769304848.6563416/hparams.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ batch_size: 48
2
+ grad_accum_every: 1
3
+ learning_rate: 1.0e-05
4
+ num_train_steps: 2000
20260125_0933/reward_model/events.out.tfevents.1769304848.MACLAB-S004.1519845.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26a03747b7fe4dfb03f91816a786eee4cedec85474701f7636e7363c8f5ad76e
3
+ size 873949
20260125_0933/train.20260125_0933.log ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-25 09:33:55 | INFO | Log file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/train.20260125_0933.log
2
+ 2026-01-25 09:33:55 | INFO | Config: DEVICES: '0'
3
+ accelerate:
4
+ mixed_precision: bf16
5
+ basics:
6
+ random_seed: 42
7
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human
8
+ dataset:
9
+ audio_dropout:
10
+ apply_to_eval: true
11
+ apply_to_ref: true
12
+ enabled: true
13
+ eval_only_on_training: true
14
+ max_duration: 1500
15
+ min_duration: 200
16
+ train_mode: start
17
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
18
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
19
+ duration: 600.0
20
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
21
+ mode: raw_text_frozen_audio
22
+ max_samples: null
23
+ max_val_samples: null
24
+ metadata_jsonl: ${project_root}/CMI-Training/all_comparisons.jsonl
25
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
26
+ sample_rate: 24000
27
+ val_preference_file: null
28
+ loss:
29
+ IF_ratio: 0.5
30
+ filter_ties: true
31
+ label_smoothing: 0.0
32
+ reduction: mean
33
+ model:
34
+ attention_mode: SA
35
+ attn_dropout: 0.0
36
+ category_embeddings: null
37
+ dim: 768
38
+ dim_head: 64
39
+ downsample:
40
+ configs:
41
+ conv2_4x:
42
+ factor: 4
43
+ kernel_size: 5
44
+ kind: conv*2
45
+ use_layernorm: true
46
+ conv_4x:
47
+ factor: 4
48
+ kernel_size: 5
49
+ kind: conv
50
+ stage: 1
51
+ use_layernorm: true
52
+ glu_4x:
53
+ factor: 4
54
+ kernel_size: 5
55
+ kind: gluconv*2+pw
56
+ use_layernorm: true
57
+ mean:
58
+ factor: 2
59
+ kind: mean
60
+ mean_4x:
61
+ dropout: 0.0
62
+ factor: 30
63
+ kind: mean+mlp
64
+ mlp_ratio: 2.0
65
+ none:
66
+ factor: 1
67
+ kind: none
68
+ eval: mean_4x
69
+ ref: null
70
+ text: none
71
+ ff_dropout: 0.0
72
+ ff_mult: 4
73
+ freeze_audio: true
74
+ freeze_text: true
75
+ gradient_checkpointing: false
76
+ heads: 8
77
+ joint_tf_depth: 1
78
+ load_config:
79
+ checkpoint_path: null
80
+ frozen_from_pretrained: true
81
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
82
+ strict: false
83
+ mlp_dim: 768
84
+ mode: concat_text_late
85
+ model_name: OpenMuQ/MuQ-MuLan-large
86
+ name: reward
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_layer_idx: -1
107
+ use_audio: true
108
+ no_condition: false
109
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
110
+ run_name: null
111
+ train:
112
+ batch_size: 48
113
+ betas:
114
+ - 0.9
115
+ - 0.99
116
+ ema_decay: 0.9999
117
+ ema_update_every: 1
118
+ enable_gradient_checkpointing: true
119
+ force_clear_prev_results: false
120
+ grad_accum_every: 1
121
+ log_tensorboard: true
122
+ lr_schedule:
123
+ min_lr_ratio: 0.001
124
+ name: linear_cosine
125
+ total_steps: 2000
126
+ warmup_steps: 10
127
+ max_grad_norm: 1
128
+ mlp_lr: 1.0e-05
129
+ num_train_steps: 2000
130
+ num_valid_batches: null
131
+ num_workers: 8
132
+ other_lr: 1.0e-05
133
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
134
+ resume_optimizer: false
135
+ save_model_every: 2000
136
+ use_checkpoint_config: true
137
+ use_ema: false
138
+ use_lion: false
139
+ valid_batch_size: 20
140
+ valid_every: 100
141
+ valid_frac: 0.1
142
+ verify_weights_on_load: true
143
+ validate_only: false
144
+
145
+ 2026-01-25 09:33:55 | INFO | Random seed set to 42
146
+ 2026-01-25 09:33:56 | INFO | Created RawTextFrozenAudioDataset with 3463 samples
147
+ 2026-01-25 09:33:56 | INFO | Split dataset into train (3117) and validation (346) sets (ratio: 10.00%)
148
+ 2026-01-25 09:33:56 | INFO | Will resume from checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
149
+ 2026-01-25 09:33:56 | INFO | Using checkpoint config for model initialization (continue training mode)
150
+ 2026-01-25 09:34:06 | INFO | Created RewardAttentionModel with attention_mode=SA
151
+ 2026-01-25 09:34:06 | INFO | Created PreferenceLoss with filter_ties=True
152
+ 2026-01-25 09:34:06 | INFO | ✓ Gradient checkpointing enabled
153
+ 2026-01-25 09:34:06 | INFO | ✓ Audio cropping enabled: min=200, max=1500
154
+ 2026-01-25 09:34:06 | INFO | Apply to eval: True, ref: True
155
+ 2026-01-25 09:34:06 | INFO | Modes: train=random, val=start
156
+ 2026-01-25 09:34:06 | INFO | MLP head parameters: 1,186,563 params, lr=1e-05
157
+ 2026-01-25 09:34:06 | INFO | Other parameters: 37,397,634 params, lr=1e-05
158
+ 2026-01-25 09:34:06 | INFO | Using lr_schedule=linear_cosine warmup_steps=10 total_steps=2000
159
+ 2026-01-25 09:34:06 | INFO | Training with fixed validation set
160
+ 2026-01-25 09:34:06 | INFO | Train batch_size: 48, Valid batch_size: 20
161
+ 2026-01-25 09:34:08 | INFO | Missing keys (782): ['text_module.model.embeddings.word_embeddings.weight', 'text_module.model.embeddings.position_embeddings.weight', 'text_module.model.embeddings.token_type_embeddings.weight', 'text_module.model.embeddings.LayerNorm.weight', 'text_module.model.embeddings.LayerNorm.bias']...
162
+ 2026-01-25 09:34:08 | INFO | ✓ Starting from step 0 (transfer learning mode, ignoring checkpoint steps=29999)
163
+ 2026-01-25 09:34:08 | INFO | Resumed from /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260124_0147/ckpt/reward_model.best_29999.pt
164
+ 2026-01-25 09:34:08 | INFO | Parameters: 701.162M total, 38.584M trainable
165
+ 2026-01-25 09:34:08 | INFO | Text encoder (frozen): 328.389M
166
+ 2026-01-25 09:34:08 | INFO | Audio encoder (frozen): 334.189M
167
+ 2026-01-25 09:34:08 | INFO | Other trainable: 38.584M
168
+ 2026-01-25 09:34:08 | INFO | ℹ No LoRA configuration detected
169
+ 2026-01-25 09:34:08 | INFO | ============================================================
170
+ 2026-01-25 09:34:08 | INFO | Ready to start training
171
+ 2026-01-25 09:34:08 | INFO | ============================================================
172
+ 2026-01-25 09:34:08 | INFO | Starting training from step 0
173
+ 2026-01-25 09:34:08 | INFO | ===== Accelerator / CUDA Debug Info =====
174
+ 2026-01-25 09:34:08 | INFO | accelerator.device = cuda
175
+ 2026-01-25 09:34:08 | INFO | mixed_precision = bf16
176
+ 2026-01-25 09:34:08 | INFO | distributed_type = NO
177
+ 2026-01-25 09:34:08 | INFO | num_processes = 1
178
+ 2026-01-25 09:34:08 | INFO | process_index = 0
179
+ 2026-01-25 09:34:08 | INFO | is_main_process = True
180
+ 2026-01-25 09:34:08 | INFO | torch.cuda.is_available() = True
181
+ 2026-01-25 09:34:08 | INFO | torch.cuda.device_count() = 1
182
+ 2026-01-25 09:34:08 | INFO | current_device = 0
183
+ 2026-01-25 09:34:08 | INFO | device_name = NVIDIA GeForce RTX 4090
184
+ 2026-01-25 09:34:08 | INFO | model parameter device = cuda:0
185
+ 2026-01-25 09:34:08 | INFO | Training for 2000.0 steps (~32 epochs, 64 steps/epoch)
186
+ 2026-01-25 09:34:17 | INFO | Step 0: loss=1.7986 | IF_loss=2.3230, MQ_loss=1.2743 | acc=0.750 (IF=0.688, MQ=0.812) | lr=0.000002
187
+ 2026-01-25 09:34:17 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.0.pt (filtered to 38.584M trainable parameters)
188
+ 2026-01-25 09:34:17 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.0.pt (428.0MB)
189
+ 2026-01-25 09:34:17 | INFO | Step 0: Saved to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.0.pt
190
+ 2026-01-25 09:35:59 | INFO |
191
+ ============================================================
192
+ Validation Results (took 7.82s):
193
+ Samples: 346 instruction, 346 quality
194
+ Instruction Acc: 0.7110
195
+ Quality Acc: 0.6879
196
+ Average Acc: 0.6994
197
+ Total Loss: 1.2359
198
+ Instruction Loss: 1.2306
199
+ Quality Loss: 1.2412
200
+ ============================================================
201
+ 2026-01-25 09:35:59 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_99.pt (filtered to 38.584M trainable parameters)
202
+ 2026-01-25 09:35:59 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_99.pt (428.0MB)
203
+ 2026-01-25 09:35:59 | INFO | Best 1 checkpoints:
204
+ 2026-01-25 09:35:59 | INFO | 1. Step 99: acc=0.6994 (reward_model.best_99.pt)
205
+ 2026-01-25 09:36:00 | INFO | Step 100: loss=1.0465 | IF_loss=0.8500, MQ_loss=1.2430 | acc=0.688 (IF=0.708, MQ=0.667) | lr=0.000010
206
+ 2026-01-25 09:37:40 | INFO |
207
+ ============================================================
208
+ Validation Results (took 6.16s):
209
+ Samples: 346 instruction, 346 quality
210
+ Instruction Acc: 0.6994
211
+ Quality Acc: 0.7370
212
+ Average Acc: 0.7182
213
+ Total Loss: 0.7219
214
+ Instruction Loss: 0.7455
215
+ Quality Loss: 0.6983
216
+ ============================================================
217
+ 2026-01-25 09:37:40 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_199.pt (filtered to 38.584M trainable parameters)
218
+ 2026-01-25 09:37:41 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_199.pt (428.0MB)
219
+ 2026-01-25 09:37:41 | INFO | Best 2 checkpoints:
220
+ 2026-01-25 09:37:41 | INFO | 1. Step 199: acc=0.7182 (reward_model.best_199.pt)
221
+ 2026-01-25 09:37:41 | INFO | 2. Step 99: acc=0.6994 (reward_model.best_99.pt)
222
+ 2026-01-25 09:37:42 | INFO | Step 200: loss=0.3606 | IF_loss=0.3367, MQ_loss=0.3845 | acc=0.823 (IF=0.833, MQ=0.812) | lr=0.000010
223
+ 2026-01-25 09:39:20 | INFO |
224
+ ============================================================
225
+ Validation Results (took 6.75s):
226
+ Samples: 346 instruction, 346 quality
227
+ Instruction Acc: 0.7110
228
+ Quality Acc: 0.7572
229
+ Average Acc: 0.7341
230
+ Total Loss: 0.6270
231
+ Instruction Loss: 0.6676
232
+ Quality Loss: 0.5865
233
+ ============================================================
234
+ 2026-01-25 09:39:20 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_299.pt (filtered to 38.584M trainable parameters)
235
+ 2026-01-25 09:39:20 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_299.pt (428.0MB)
236
+ 2026-01-25 09:39:20 | INFO | Best 3 checkpoints:
237
+ 2026-01-25 09:39:20 | INFO | 1. Step 299: acc=0.7341 (reward_model.best_299.pt)
238
+ 2026-01-25 09:39:20 | INFO | 2. Step 199: acc=0.7182 (reward_model.best_199.pt)
239
+ 2026-01-25 09:39:20 | INFO | 3. Step 99: acc=0.6994 (reward_model.best_99.pt)
240
+ 2026-01-25 09:39:21 | INFO | Step 300: loss=0.3793 | IF_loss=0.4554, MQ_loss=0.3032 | acc=0.844 (IF=0.833, MQ=0.854) | lr=0.000009
241
+ 2026-01-25 09:41:04 | INFO |
242
+ ============================================================
243
+ Validation Results (took 6.57s):
244
+ Samples: 346 instruction, 346 quality
245
+ Instruction Acc: 0.7110
246
+ Quality Acc: 0.7486
247
+ Average Acc: 0.7298
248
+ Total Loss: 0.6179
249
+ Instruction Loss: 0.6620
250
+ Quality Loss: 0.5737
251
+ ============================================================
252
+ 2026-01-25 09:41:04 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_399.pt (filtered to 38.584M trainable parameters)
253
+ 2026-01-25 09:41:04 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_399.pt (428.0MB)
254
+ 2026-01-25 09:41:04 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_99.pt
255
+ 2026-01-25 09:41:04 | INFO | Best 3 checkpoints:
256
+ 2026-01-25 09:41:04 | INFO | 1. Step 299: acc=0.7341 (reward_model.best_299.pt)
257
+ 2026-01-25 09:41:04 | INFO | 2. Step 399: acc=0.7298 (reward_model.best_399.pt)
258
+ 2026-01-25 09:41:04 | INFO | 3. Step 199: acc=0.7182 (reward_model.best_199.pt)
259
+ 2026-01-25 09:41:05 | INFO | Step 400: loss=0.4959 | IF_loss=0.5285, MQ_loss=0.4633 | acc=0.812 (IF=0.792, MQ=0.833) | lr=0.000009
260
+ 2026-01-25 09:42:46 | INFO |
261
+ ============================================================
262
+ Validation Results (took 7.13s):
263
+ Samples: 346 instruction, 346 quality
264
+ Instruction Acc: 0.7023
265
+ Quality Acc: 0.7601
266
+ Average Acc: 0.7312
267
+ Total Loss: 0.6337
268
+ Instruction Loss: 0.6835
269
+ Quality Loss: 0.5838
270
+ ============================================================
271
+ 2026-01-25 09:42:46 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_499.pt (filtered to 38.584M trainable parameters)
272
+ 2026-01-25 09:42:46 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_499.pt (428.0MB)
273
+ 2026-01-25 09:42:46 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_199.pt
274
+ 2026-01-25 09:42:46 | INFO | Best 3 checkpoints:
275
+ 2026-01-25 09:42:46 | INFO | 1. Step 299: acc=0.7341 (reward_model.best_299.pt)
276
+ 2026-01-25 09:42:46 | INFO | 2. Step 499: acc=0.7312 (reward_model.best_499.pt)
277
+ 2026-01-25 09:42:46 | INFO | 3. Step 399: acc=0.7298 (reward_model.best_399.pt)
278
+ 2026-01-25 09:42:47 | INFO | Step 500: loss=0.4449 | IF_loss=0.5034, MQ_loss=0.3864 | acc=0.698 (IF=0.667, MQ=0.729) | lr=0.000009
279
+ 2026-01-25 09:44:34 | INFO |
280
+ ============================================================
281
+ Validation Results (took 6.93s):
282
+ Samples: 346 instruction, 346 quality
283
+ Instruction Acc: 0.7139
284
+ Quality Acc: 0.7601
285
+ Average Acc: 0.7370
286
+ Total Loss: 0.6450
287
+ Instruction Loss: 0.6969
288
+ Quality Loss: 0.5930
289
+ ============================================================
290
+ 2026-01-25 09:44:34 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_599.pt (filtered to 38.584M trainable parameters)
291
+ 2026-01-25 09:44:34 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_599.pt (428.0MB)
292
+ 2026-01-25 09:44:34 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_399.pt
293
+ 2026-01-25 09:44:34 | INFO | Best 3 checkpoints:
294
+ 2026-01-25 09:44:34 | INFO | 1. Step 599: acc=0.7370 (reward_model.best_599.pt)
295
+ 2026-01-25 09:44:34 | INFO | 2. Step 299: acc=0.7341 (reward_model.best_299.pt)
296
+ 2026-01-25 09:44:34 | INFO | 3. Step 499: acc=0.7312 (reward_model.best_499.pt)
297
+ 2026-01-25 09:44:35 | INFO | Step 600: loss=0.4510 | IF_loss=0.4687, MQ_loss=0.4333 | acc=0.812 (IF=0.792, MQ=0.833) | lr=0.000008
298
+ 2026-01-25 09:46:19 | INFO |
299
+ ============================================================
300
+ Validation Results (took 7.72s):
301
+ Samples: 346 instruction, 346 quality
302
+ Instruction Acc: 0.7139
303
+ Quality Acc: 0.7688
304
+ Average Acc: 0.7413
305
+ Total Loss: 0.6580
306
+ Instruction Loss: 0.7161
307
+ Quality Loss: 0.5999
308
+ ============================================================
309
+ 2026-01-25 09:46:19 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_699.pt (filtered to 38.584M trainable parameters)
310
+ 2026-01-25 09:46:19 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_699.pt (428.0MB)
311
+ 2026-01-25 09:46:19 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_499.pt
312
+ 2026-01-25 09:46:19 | INFO | Best 3 checkpoints:
313
+ 2026-01-25 09:46:19 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
314
+ 2026-01-25 09:46:19 | INFO | 2. Step 599: acc=0.7370 (reward_model.best_599.pt)
315
+ 2026-01-25 09:46:19 | INFO | 3. Step 299: acc=0.7341 (reward_model.best_299.pt)
316
+ 2026-01-25 09:46:20 | INFO | Step 700: loss=0.2300 | IF_loss=0.3156, MQ_loss=0.1444 | acc=0.906 (IF=0.896, MQ=0.917) | lr=0.000007
317
+ 2026-01-25 09:48:06 | INFO |
318
+ ============================================================
319
+ Validation Results (took 6.72s):
320
+ Samples: 346 instruction, 346 quality
321
+ Instruction Acc: 0.7081
322
+ Quality Acc: 0.7630
323
+ Average Acc: 0.7355
324
+ Total Loss: 0.6754
325
+ Instruction Loss: 0.7388
326
+ Quality Loss: 0.6120
327
+ ============================================================
328
+ 2026-01-25 09:48:07 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_799.pt (filtered to 38.584M trainable parameters)
329
+ 2026-01-25 09:48:07 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_799.pt (428.0MB)
330
+ 2026-01-25 09:48:07 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_299.pt
331
+ 2026-01-25 09:48:07 | INFO | Best 3 checkpoints:
332
+ 2026-01-25 09:48:07 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
333
+ 2026-01-25 09:48:07 | INFO | 2. Step 599: acc=0.7370 (reward_model.best_599.pt)
334
+ 2026-01-25 09:48:07 | INFO | 3. Step 799: acc=0.7355 (reward_model.best_799.pt)
335
+ 2026-01-25 09:48:08 | INFO | Step 800: loss=0.3552 | IF_loss=0.4192, MQ_loss=0.2911 | acc=0.844 (IF=0.833, MQ=0.854) | lr=0.000007
336
+ 2026-01-25 09:49:55 | INFO |
337
+ ============================================================
338
+ Validation Results (took 6.63s):
339
+ Samples: 346 instruction, 346 quality
340
+ Instruction Acc: 0.7081
341
+ Quality Acc: 0.7630
342
+ Average Acc: 0.7355
343
+ Total Loss: 0.6859
344
+ Instruction Loss: 0.7508
345
+ Quality Loss: 0.6209
346
+ ============================================================
347
+ 2026-01-25 09:49:55 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_899.pt (filtered to 38.584M trainable parameters)
348
+ 2026-01-25 09:49:55 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_899.pt (428.0MB)
349
+ 2026-01-25 09:49:55 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_899.pt
350
+ 2026-01-25 09:49:55 | INFO | Best 3 checkpoints:
351
+ 2026-01-25 09:49:55 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
352
+ 2026-01-25 09:49:55 | INFO | 2. Step 599: acc=0.7370 (reward_model.best_599.pt)
353
+ 2026-01-25 09:49:55 | INFO | 3. Step 799: acc=0.7355 (reward_model.best_799.pt)
354
+ 2026-01-25 09:49:56 | INFO | Step 900: loss=0.3278 | IF_loss=0.3222, MQ_loss=0.3335 | acc=0.865 (IF=0.875, MQ=0.854) | lr=0.000006
355
+ 2026-01-25 09:51:41 | INFO |
356
+ ============================================================
357
+ Validation Results (took 6.87s):
358
+ Samples: 346 instruction, 346 quality
359
+ Instruction Acc: 0.7081
360
+ Quality Acc: 0.7630
361
+ Average Acc: 0.7355
362
+ Total Loss: 0.7067
363
+ Instruction Loss: 0.7755
364
+ Quality Loss: 0.6378
365
+ ============================================================
366
+ 2026-01-25 09:51:41 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_999.pt (filtered to 38.584M trainable parameters)
367
+ 2026-01-25 09:51:41 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_999.pt (428.0MB)
368
+ 2026-01-25 09:51:41 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_999.pt
369
+ 2026-01-25 09:51:41 | INFO | Best 3 checkpoints:
370
+ 2026-01-25 09:51:41 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
371
+ 2026-01-25 09:51:41 | INFO | 2. Step 599: acc=0.7370 (reward_model.best_599.pt)
372
+ 2026-01-25 09:51:41 | INFO | 3. Step 799: acc=0.7355 (reward_model.best_799.pt)
373
+ 2026-01-25 09:51:42 | INFO | Step 1000: loss=0.2557 | IF_loss=0.2447, MQ_loss=0.2666 | acc=0.896 (IF=0.938, MQ=0.854) | lr=0.000005
374
+ 2026-01-25 09:53:32 | INFO |
375
+ ============================================================
376
+ Validation Results (took 7.21s):
377
+ Samples: 346 instruction, 346 quality
378
+ Instruction Acc: 0.7110
379
+ Quality Acc: 0.7659
380
+ Average Acc: 0.7384
381
+ Total Loss: 0.7211
382
+ Instruction Loss: 0.7922
383
+ Quality Loss: 0.6501
384
+ ============================================================
385
+ 2026-01-25 09:53:32 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1099.pt (filtered to 38.584M trainable parameters)
386
+ 2026-01-25 09:53:32 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1099.pt (428.0MB)
387
+ 2026-01-25 09:53:32 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_799.pt
388
+ 2026-01-25 09:53:32 | INFO | Best 3 checkpoints:
389
+ 2026-01-25 09:53:32 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
390
+ 2026-01-25 09:53:32 | INFO | 2. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
391
+ 2026-01-25 09:53:32 | INFO | 3. Step 599: acc=0.7370 (reward_model.best_599.pt)
392
+ 2026-01-25 09:53:33 | INFO | Step 1100: loss=0.2468 | IF_loss=0.2882, MQ_loss=0.2053 | acc=0.875 (IF=0.875, MQ=0.875) | lr=0.000004
393
+ 2026-01-25 09:55:16 | INFO |
394
+ ============================================================
395
+ Validation Results (took 7.04s):
396
+ Samples: 346 instruction, 346 quality
397
+ Instruction Acc: 0.7081
398
+ Quality Acc: 0.7688
399
+ Average Acc: 0.7384
400
+ Total Loss: 0.7338
401
+ Instruction Loss: 0.8081
402
+ Quality Loss: 0.6596
403
+ ============================================================
404
+ 2026-01-25 09:55:16 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1199.pt (filtered to 38.584M trainable parameters)
405
+ 2026-01-25 09:55:17 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1199.pt (428.0MB)
406
+ 2026-01-25 09:55:17 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_599.pt
407
+ 2026-01-25 09:55:17 | INFO | Best 3 checkpoints:
408
+ 2026-01-25 09:55:17 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
409
+ 2026-01-25 09:55:17 | INFO | 2. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
410
+ 2026-01-25 09:55:17 | INFO | 3. Step 1199: acc=0.7384 (reward_model.best_1199.pt)
411
+ 2026-01-25 09:55:18 | INFO | Step 1200: loss=0.2555 | IF_loss=0.3150, MQ_loss=0.1960 | acc=0.833 (IF=0.812, MQ=0.854) | lr=0.000003
412
+ 2026-01-25 09:57:06 | INFO |
413
+ ============================================================
414
+ Validation Results (took 6.87s):
415
+ Samples: 346 instruction, 346 quality
416
+ Instruction Acc: 0.7081
417
+ Quality Acc: 0.7659
418
+ Average Acc: 0.7370
419
+ Total Loss: 0.7409
420
+ Instruction Loss: 0.8178
421
+ Quality Loss: 0.6641
422
+ ============================================================
423
+ 2026-01-25 09:57:06 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1299.pt (filtered to 38.584M trainable parameters)
424
+ 2026-01-25 09:57:07 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1299.pt (428.0MB)
425
+ 2026-01-25 09:57:07 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1299.pt
426
+ 2026-01-25 09:57:07 | INFO | Best 3 checkpoints:
427
+ 2026-01-25 09:57:07 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
428
+ 2026-01-25 09:57:07 | INFO | 2. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
429
+ 2026-01-25 09:57:07 | INFO | 3. Step 1199: acc=0.7384 (reward_model.best_1199.pt)
430
+ 2026-01-25 09:57:08 | INFO | Step 1300: loss=0.3035 | IF_loss=0.2872, MQ_loss=0.3198 | acc=0.865 (IF=0.854, MQ=0.875) | lr=0.000003
431
+ 2026-01-25 09:58:54 | INFO |
432
+ ============================================================
433
+ Validation Results (took 8.06s):
434
+ Samples: 346 instruction, 346 quality
435
+ Instruction Acc: 0.7052
436
+ Quality Acc: 0.7659
437
+ Average Acc: 0.7355
438
+ Total Loss: 0.7497
439
+ Instruction Loss: 0.8259
440
+ Quality Loss: 0.6735
441
+ ============================================================
442
+ 2026-01-25 09:58:55 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1399.pt (filtered to 38.584M trainable parameters)
443
+ 2026-01-25 09:58:55 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1399.pt (428.0MB)
444
+ 2026-01-25 09:58:55 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1399.pt
445
+ 2026-01-25 09:58:55 | INFO | Best 3 checkpoints:
446
+ 2026-01-25 09:58:55 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
447
+ 2026-01-25 09:58:55 | INFO | 2. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
448
+ 2026-01-25 09:58:55 | INFO | 3. Step 1199: acc=0.7384 (reward_model.best_1199.pt)
449
+ 2026-01-25 09:58:56 | INFO | Step 1400: loss=0.2354 | IF_loss=0.2780, MQ_loss=0.1928 | acc=0.917 (IF=0.896, MQ=0.938) | lr=0.000002
450
+ 2026-01-25 10:00:46 | INFO |
451
+ ============================================================
452
+ Validation Results (took 6.79s):
453
+ Samples: 346 instruction, 346 quality
454
+ Instruction Acc: 0.7052
455
+ Quality Acc: 0.7717
456
+ Average Acc: 0.7384
457
+ Total Loss: 0.7534
458
+ Instruction Loss: 0.8301
459
+ Quality Loss: 0.6767
460
+ ============================================================
461
+ 2026-01-25 10:00:46 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1499.pt (filtered to 38.584M trainable parameters)
462
+ 2026-01-25 10:00:46 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1499.pt (428.0MB)
463
+ 2026-01-25 10:00:46 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1499.pt
464
+ 2026-01-25 10:00:46 | INFO | Best 3 checkpoints:
465
+ 2026-01-25 10:00:46 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
466
+ 2026-01-25 10:00:46 | INFO | 2. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
467
+ 2026-01-25 10:00:46 | INFO | 3. Step 1199: acc=0.7384 (reward_model.best_1199.pt)
468
+ 2026-01-25 10:00:47 | INFO | Step 1500: loss=0.2509 | IF_loss=0.2888, MQ_loss=0.2131 | acc=0.875 (IF=0.875, MQ=0.875) | lr=0.000001
469
+ 2026-01-25 10:02:32 | INFO |
470
+ ============================================================
471
+ Validation Results (took 7.11s):
472
+ Samples: 346 instruction, 346 quality
473
+ Instruction Acc: 0.7081
474
+ Quality Acc: 0.7717
475
+ Average Acc: 0.7399
476
+ Total Loss: 0.7576
477
+ Instruction Loss: 0.8358
478
+ Quality Loss: 0.6793
479
+ ============================================================
480
+ 2026-01-25 10:02:32 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1599.pt (filtered to 38.584M trainable parameters)
481
+ 2026-01-25 10:02:33 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1599.pt (428.0MB)
482
+ 2026-01-25 10:02:33 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1199.pt
483
+ 2026-01-25 10:02:33 | INFO | Best 3 checkpoints:
484
+ 2026-01-25 10:02:33 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
485
+ 2026-01-25 10:02:33 | INFO | 2. Step 1599: acc=0.7399 (reward_model.best_1599.pt)
486
+ 2026-01-25 10:02:33 | INFO | 3. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
487
+ 2026-01-25 10:02:38 | INFO | Step 1600: loss=0.1956 | IF_loss=0.2453, MQ_loss=0.1458 | acc=0.938 (IF=0.896, MQ=0.979) | lr=0.000001
488
+ 2026-01-25 10:04:22 | INFO |
489
+ ============================================================
490
+ Validation Results (took 7.02s):
491
+ Samples: 346 instruction, 346 quality
492
+ Instruction Acc: 0.7023
493
+ Quality Acc: 0.7688
494
+ Average Acc: 0.7355
495
+ Total Loss: 0.7592
496
+ Instruction Loss: 0.8378
497
+ Quality Loss: 0.6806
498
+ ============================================================
499
+ 2026-01-25 10:04:22 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1699.pt (filtered to 38.584M trainable parameters)
500
+ 2026-01-25 10:04:22 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1699.pt (428.0MB)
501
+ 2026-01-25 10:04:22 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1699.pt
502
+ 2026-01-25 10:04:22 | INFO | Best 3 checkpoints:
503
+ 2026-01-25 10:04:22 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
504
+ 2026-01-25 10:04:22 | INFO | 2. Step 1599: acc=0.7399 (reward_model.best_1599.pt)
505
+ 2026-01-25 10:04:22 | INFO | 3. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
506
+ 2026-01-25 10:04:23 | INFO | Step 1700: loss=0.3023 | IF_loss=0.2025, MQ_loss=0.4021 | acc=0.854 (IF=0.917, MQ=0.792) | lr=0.000001
507
+ 2026-01-25 10:06:16 | INFO |
508
+ ============================================================
509
+ Validation Results (took 7.44s):
510
+ Samples: 346 instruction, 346 quality
511
+ Instruction Acc: 0.7023
512
+ Quality Acc: 0.7659
513
+ Average Acc: 0.7341
514
+ Total Loss: 0.7613
515
+ Instruction Loss: 0.8400
516
+ Quality Loss: 0.6826
517
+ ============================================================
518
+ 2026-01-25 10:06:16 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1799.pt (filtered to 38.584M trainable parameters)
519
+ 2026-01-25 10:06:16 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1799.pt (428.0MB)
520
+ 2026-01-25 10:06:16 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1799.pt
521
+ 2026-01-25 10:06:16 | INFO | Best 3 checkpoints:
522
+ 2026-01-25 10:06:16 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
523
+ 2026-01-25 10:06:16 | INFO | 2. Step 1599: acc=0.7399 (reward_model.best_1599.pt)
524
+ 2026-01-25 10:06:16 | INFO | 3. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
525
+ 2026-01-25 10:06:17 | INFO | Step 1800: loss=0.1655 | IF_loss=0.1916, MQ_loss=0.1395 | acc=0.896 (IF=0.875, MQ=0.917) | lr=0.000000
526
+ 2026-01-25 10:08:05 | INFO |
527
+ ============================================================
528
+ Validation Results (took 7.05s):
529
+ Samples: 346 instruction, 346 quality
530
+ Instruction Acc: 0.7023
531
+ Quality Acc: 0.7717
532
+ Average Acc: 0.7370
533
+ Total Loss: 0.7619
534
+ Instruction Loss: 0.8410
535
+ Quality Loss: 0.6828
536
+ ============================================================
537
+ 2026-01-25 10:08:05 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1899.pt (filtered to 38.584M trainable parameters)
538
+ 2026-01-25 10:08:05 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1899.pt (428.0MB)
539
+ 2026-01-25 10:08:05 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1899.pt
540
+ 2026-01-25 10:08:05 | INFO | Best 3 checkpoints:
541
+ 2026-01-25 10:08:05 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
542
+ 2026-01-25 10:08:05 | INFO | 2. Step 1599: acc=0.7399 (reward_model.best_1599.pt)
543
+ 2026-01-25 10:08:05 | INFO | 3. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
544
+ 2026-01-25 10:08:06 | INFO | Step 1900: loss=0.2225 | IF_loss=0.2413, MQ_loss=0.2037 | acc=0.896 (IF=0.875, MQ=0.917) | lr=0.000000
545
+ 2026-01-25 10:09:55 | INFO |
546
+ ============================================================
547
+ Validation Results (took 7.58s):
548
+ Samples: 346 instruction, 346 quality
549
+ Instruction Acc: 0.7023
550
+ Quality Acc: 0.7688
551
+ Average Acc: 0.7355
552
+ Total Loss: 0.7619
553
+ Instruction Loss: 0.8410
554
+ Quality Loss: 0.6827
555
+ ============================================================
556
+ 2026-01-25 10:09:55 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1999.pt (filtered to 38.584M trainable parameters)
557
+ 2026-01-25 10:09:55 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1999.pt (428.0MB)
558
+ 2026-01-25 10:09:55 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0933/ckpt/reward_model.best_1999.pt
559
+ 2026-01-25 10:09:55 | INFO | Best 3 checkpoints:
560
+ 2026-01-25 10:09:55 | INFO | 1. Step 699: acc=0.7413 (reward_model.best_699.pt)
561
+ 2026-01-25 10:09:55 | INFO | 2. Step 1599: acc=0.7399 (reward_model.best_1599.pt)
562
+ 2026-01-25 10:09:55 | INFO | 3. Step 1099: acc=0.7384 (reward_model.best_1099.pt)
563
+ 2026-01-25 10:09:55 | INFO | Training complete!
564
+ 2026-01-25 10:09:55 | INFO | Training complete!
20260125_0947_CA/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '2'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: true
10
+ apply_to_ref: true
11
+ enabled: true
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 200
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ max_samples: null
21
+ max_val_samples: null
22
+ metadata_jsonl: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/all_comparisons.jsonl
23
+ mode: raw_text_frozen_audio
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: null
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ no_condition: false
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_audio: true
107
+ use_layer_idx: -1
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: CA
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 2000
125
+ warmup_steps: 10
126
+ max_grad_norm: 1
127
+ mlp_lr: 1.0e-05
128
+ num_train_steps: 2000
129
+ num_valid_batches: null
130
+ num_workers: 8
131
+ other_lr: 1.0e-05
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260123_1310/ckpt/reward_model.best_25999.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: false
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 100
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: false
20260125_0947_CA/eval_results_0125_1703.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
20260125_0947_CA/reward_model/1769305674.1033533/events.out.tfevents.1769305674.MACLAB-S004.1592070.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5491cc26f71b367dacdcda2398f1629e1eb90969ee662e1e00c9c0e40d9ce02c
3
+ size 503
20260125_0947_CA/reward_model/1769305674.1053352/hparams.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ batch_size: 48
2
+ grad_accum_every: 1
3
+ learning_rate: 1.0e-05
4
+ num_train_steps: 2000
20260125_0947_CA/reward_model/events.out.tfevents.1769305674.MACLAB-S004.1592070.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e55f888cb7afbd9e4d63134150e67da713c005819afe82f6f41ffd948a4993a8
3
+ size 874266
20260125_0947_CA/train.20260125_0947_CA.log ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-25 09:47:44 | INFO | Log file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/train.20260125_0947_CA.log
2
+ 2026-01-25 09:47:44 | INFO | Random seed set to 42
3
+ 2026-01-25 09:47:45 | INFO | Created RawTextFrozenAudioDataset with 3463 samples
4
+ 2026-01-25 09:47:45 | INFO | Split dataset into train (3117) and validation (346) sets (ratio: 10.00%)
5
+ 2026-01-25 09:47:45 | INFO | Will resume from checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260123_1310/ckpt/reward_model.best_25999.pt
6
+ 2026-01-25 09:47:45 | INFO | Using checkpoint config for model initialization (continue training mode)
7
+ 2026-01-25 09:47:52 | INFO | Created RewardAttentionModel with attention_mode=CA
8
+ 2026-01-25 09:47:52 | INFO | Created PreferenceLoss with filter_ties=True
9
+ 2026-01-25 09:47:52 | INFO | ✓ Gradient checkpointing enabled
10
+ 2026-01-25 09:47:52 | INFO | ✓ Audio cropping enabled: min=200, max=1500
11
+ 2026-01-25 09:47:52 | INFO | Apply to eval: True, ref: True
12
+ 2026-01-25 09:47:52 | INFO | Modes: train=random, val=start
13
+ 2026-01-25 09:47:52 | INFO | MLP head parameters: 1,186,563 params, lr=1e-05
14
+ 2026-01-25 09:47:52 | INFO | Other parameters: 20,092,674 params, lr=1e-05
15
+ 2026-01-25 09:47:52 | INFO | Using lr_schedule=linear_cosine warmup_steps=10 total_steps=2000
16
+ 2026-01-25 09:47:52 | INFO | Training with fixed validation set
17
+ 2026-01-25 09:47:52 | INFO | Train batch_size: 48, Valid batch_size: 20
18
+ 2026-01-25 09:47:53 | INFO | Missing keys (782): ['text_module.model.embeddings.word_embeddings.weight', 'text_module.model.embeddings.position_embeddings.weight', 'text_module.model.embeddings.token_type_embeddings.weight', 'text_module.model.embeddings.LayerNorm.weight', 'text_module.model.embeddings.LayerNorm.bias']...
19
+ 2026-01-25 09:47:53 | INFO | ✓ Starting from step 0 (transfer learning mode, ignoring checkpoint steps=25999)
20
+ 2026-01-25 09:47:53 | INFO | Resumed from /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model/20260123_1310/ckpt/reward_model.best_25999.pt
21
+ 2026-01-25 09:47:53 | INFO | Parameters: 683.857M total, 21.279M trainable
22
+ 2026-01-25 09:47:53 | INFO | Text encoder (frozen): 328.389M
23
+ 2026-01-25 09:47:53 | INFO | Audio encoder (frozen): 334.189M
24
+ 2026-01-25 09:47:53 | INFO | Other trainable: 21.279M
25
+ 2026-01-25 09:47:53 | INFO | ℹ No LoRA configuration detected
26
+ 2026-01-25 09:47:54 | INFO | ============================================================
27
+ 2026-01-25 09:47:54 | INFO | Ready to start training
28
+ 2026-01-25 09:47:54 | INFO | ============================================================
29
+ 2026-01-25 09:47:54 | INFO | Starting training from step 0
30
+ 2026-01-25 09:47:54 | INFO | ===== Accelerator / CUDA Debug Info =====
31
+ 2026-01-25 09:47:54 | INFO | accelerator.device = cuda
32
+ 2026-01-25 09:47:54 | INFO | mixed_precision = bf16
33
+ 2026-01-25 09:47:54 | INFO | distributed_type = NO
34
+ 2026-01-25 09:47:54 | INFO | num_processes = 1
35
+ 2026-01-25 09:47:54 | INFO | process_index = 0
36
+ 2026-01-25 09:47:54 | INFO | is_main_process = True
37
+ 2026-01-25 09:47:54 | INFO | torch.cuda.is_available() = True
38
+ 2026-01-25 09:47:54 | INFO | torch.cuda.device_count() = 1
39
+ 2026-01-25 09:47:54 | INFO | current_device = 0
40
+ 2026-01-25 09:47:54 | INFO | device_name = NVIDIA GeForce RTX 4090
41
+ 2026-01-25 09:47:54 | INFO | model parameter device = cuda:0
42
+ 2026-01-25 09:47:54 | INFO | Training for 2000.0 steps (~32 epochs, 64 steps/epoch)
43
+ 2026-01-25 09:47:54 | INFO |
44
+ ============================================================
45
+ 2026-01-25 09:47:54 | INFO | Running initial validation after resume...
46
+ 2026-01-25 09:47:54 | INFO | ============================================================
47
+ 2026-01-25 09:48:06 | INFO |
48
+ ============================================================
49
+ Validation Results (took 12.13s):
50
+ Samples: 346 instruction, 346 quality
51
+ Instruction Acc: 0.6503
52
+ Quality Acc: 0.6532
53
+ Average Acc: 0.6517
54
+ Total Loss: 1.2600
55
+ Instruction Loss: 1.2149
56
+ Quality Loss: 1.3051
57
+ ============================================================
58
+ 2026-01-25 09:48:06 | INFO | Initial validation complete.
59
+
60
+ 2026-01-25 09:48:12 | INFO | Step 0: loss=1.5130 | IF_loss=1.3189, MQ_loss=1.7072 | acc=0.656 (IF=0.667, MQ=0.646) | lr=0.000002
61
+ 2026-01-25 09:48:12 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.0.pt (filtered to 21.279M trainable parameters)
62
+ 2026-01-25 09:48:12 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.0.pt (229.9MB)
63
+ 2026-01-25 09:48:12 | INFO | Step 0: Saved to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.0.pt
64
+ 2026-01-25 09:50:17 | INFO |
65
+ ============================================================
66
+ Validation Results (took 8.06s):
67
+ Samples: 346 instruction, 346 quality
68
+ Instruction Acc: 0.6705
69
+ Quality Acc: 0.6965
70
+ Average Acc: 0.6835
71
+ Total Loss: 0.7808
72
+ Instruction Loss: 0.7905
73
+ Quality Loss: 0.7712
74
+ ============================================================
75
+ 2026-01-25 09:50:17 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_99.pt (filtered to 21.279M trainable parameters)
76
+ 2026-01-25 09:50:17 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_99.pt (229.9MB)
77
+ 2026-01-25 09:50:17 | INFO | Best 1 checkpoints:
78
+ 2026-01-25 09:50:17 | INFO | 1. Step 99: acc=0.6835 (reward_model.best_99.pt)
79
+ 2026-01-25 09:50:18 | INFO | Step 100: loss=0.7788 | IF_loss=0.7289, MQ_loss=0.8287 | acc=0.667 (IF=0.667, MQ=0.667) | lr=0.000010
80
+ 2026-01-25 09:52:28 | INFO |
81
+ ============================================================
82
+ Validation Results (took 9.09s):
83
+ Samples: 346 instruction, 346 quality
84
+ Instruction Acc: 0.7052
85
+ Quality Acc: 0.7370
86
+ Average Acc: 0.7211
87
+ Total Loss: 0.6198
88
+ Instruction Loss: 0.6420
89
+ Quality Loss: 0.5976
90
+ ============================================================
91
+ 2026-01-25 09:52:28 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_199.pt (filtered to 21.279M trainable parameters)
92
+ 2026-01-25 09:52:28 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_199.pt (229.9MB)
93
+ 2026-01-25 09:52:28 | INFO | Best 2 checkpoints:
94
+ 2026-01-25 09:52:28 | INFO | 1. Step 199: acc=0.7211 (reward_model.best_199.pt)
95
+ 2026-01-25 09:52:28 | INFO | 2. Step 99: acc=0.6835 (reward_model.best_99.pt)
96
+ 2026-01-25 09:52:29 | INFO | Step 200: loss=0.6449 | IF_loss=0.5494, MQ_loss=0.7404 | acc=0.646 (IF=0.646, MQ=0.646) | lr=0.000010
97
+ 2026-01-25 09:54:31 | INFO |
98
+ ============================================================
99
+ Validation Results (took 8.43s):
100
+ Samples: 346 instruction, 346 quality
101
+ Instruction Acc: 0.7052
102
+ Quality Acc: 0.7457
103
+ Average Acc: 0.7254
104
+ Total Loss: 0.5725
105
+ Instruction Loss: 0.6007
106
+ Quality Loss: 0.5443
107
+ ============================================================
108
+ 2026-01-25 09:54:31 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_299.pt (filtered to 21.279M trainable parameters)
109
+ 2026-01-25 09:54:31 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_299.pt (229.9MB)
110
+ 2026-01-25 09:54:31 | INFO | Best 3 checkpoints:
111
+ 2026-01-25 09:54:31 | INFO | 1. Step 299: acc=0.7254 (reward_model.best_299.pt)
112
+ 2026-01-25 09:54:31 | INFO | 2. Step 199: acc=0.7211 (reward_model.best_199.pt)
113
+ 2026-01-25 09:54:31 | INFO | 3. Step 99: acc=0.6835 (reward_model.best_99.pt)
114
+ 2026-01-25 09:54:33 | INFO | Step 300: loss=0.5330 | IF_loss=0.4058, MQ_loss=0.6601 | acc=0.750 (IF=0.792, MQ=0.708) | lr=0.000009
115
+ 2026-01-25 09:56:42 | INFO |
116
+ ============================================================
117
+ Validation Results (took 8.75s):
118
+ Samples: 346 instruction, 346 quality
119
+ Instruction Acc: 0.7081
120
+ Quality Acc: 0.7630
121
+ Average Acc: 0.7355
122
+ Total Loss: 0.5519
123
+ Instruction Loss: 0.5810
124
+ Quality Loss: 0.5228
125
+ ============================================================
126
+ 2026-01-25 09:56:42 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_399.pt (filtered to 21.279M trainable parameters)
127
+ 2026-01-25 09:56:42 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_399.pt (229.9MB)
128
+ 2026-01-25 09:56:42 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_99.pt
129
+ 2026-01-25 09:56:42 | INFO | Best 3 checkpoints:
130
+ 2026-01-25 09:56:42 | INFO | 1. Step 399: acc=0.7355 (reward_model.best_399.pt)
131
+ 2026-01-25 09:56:42 | INFO | 2. Step 299: acc=0.7254 (reward_model.best_299.pt)
132
+ 2026-01-25 09:56:42 | INFO | 3. Step 199: acc=0.7211 (reward_model.best_199.pt)
133
+ 2026-01-25 09:56:44 | INFO | Step 400: loss=0.5271 | IF_loss=0.4825, MQ_loss=0.5716 | acc=0.740 (IF=0.729, MQ=0.750) | lr=0.000009
134
+ 2026-01-25 09:58:49 | INFO |
135
+ ============================================================
136
+ Validation Results (took 8.57s):
137
+ Samples: 346 instruction, 346 quality
138
+ Instruction Acc: 0.7168
139
+ Quality Acc: 0.7746
140
+ Average Acc: 0.7457
141
+ Total Loss: 0.5440
142
+ Instruction Loss: 0.5747
143
+ Quality Loss: 0.5133
144
+ ============================================================
145
+ 2026-01-25 09:58:49 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_499.pt (filtered to 21.279M trainable parameters)
146
+ 2026-01-25 09:58:49 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_499.pt (229.9MB)
147
+ 2026-01-25 09:58:49 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_199.pt
148
+ 2026-01-25 09:58:49 | INFO | Best 3 checkpoints:
149
+ 2026-01-25 09:58:49 | INFO | 1. Step 499: acc=0.7457 (reward_model.best_499.pt)
150
+ 2026-01-25 09:58:49 | INFO | 2. Step 399: acc=0.7355 (reward_model.best_399.pt)
151
+ 2026-01-25 09:58:49 | INFO | 3. Step 299: acc=0.7254 (reward_model.best_299.pt)
152
+ 2026-01-25 09:58:50 | INFO | Step 500: loss=0.4747 | IF_loss=0.5236, MQ_loss=0.4259 | acc=0.708 (IF=0.688, MQ=0.729) | lr=0.000009
153
+ 2026-01-25 10:00:58 | INFO |
154
+ ============================================================
155
+ Validation Results (took 8.22s):
156
+ Samples: 346 instruction, 346 quality
157
+ Instruction Acc: 0.7197
158
+ Quality Acc: 0.7717
159
+ Average Acc: 0.7457
160
+ Total Loss: 0.5393
161
+ Instruction Loss: 0.5700
162
+ Quality Loss: 0.5086
163
+ ============================================================
164
+ 2026-01-25 10:00:58 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_599.pt (filtered to 21.279M trainable parameters)
165
+ 2026-01-25 10:00:59 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_599.pt (229.9MB)
166
+ 2026-01-25 10:00:59 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_299.pt
167
+ 2026-01-25 10:00:59 | INFO | Best 3 checkpoints:
168
+ 2026-01-25 10:00:59 | INFO | 1. Step 599: acc=0.7457 (reward_model.best_599.pt)
169
+ 2026-01-25 10:00:59 | INFO | 2. Step 499: acc=0.7457 (reward_model.best_499.pt)
170
+ 2026-01-25 10:00:59 | INFO | 3. Step 399: acc=0.7355 (reward_model.best_399.pt)
171
+ 2026-01-25 10:01:00 | INFO | Step 600: loss=0.4197 | IF_loss=0.3952, MQ_loss=0.4441 | acc=0.729 (IF=0.750, MQ=0.708) | lr=0.000008
172
+ 2026-01-25 10:03:05 | INFO |
173
+ ============================================================
174
+ Validation Results (took 8.34s):
175
+ Samples: 346 instruction, 346 quality
176
+ Instruction Acc: 0.7168
177
+ Quality Acc: 0.7803
178
+ Average Acc: 0.7486
179
+ Total Loss: 0.5377
180
+ Instruction Loss: 0.5695
181
+ Quality Loss: 0.5060
182
+ ============================================================
183
+ 2026-01-25 10:03:05 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_699.pt (filtered to 21.279M trainable parameters)
184
+ 2026-01-25 10:03:05 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_699.pt (229.9MB)
185
+ 2026-01-25 10:03:05 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_399.pt
186
+ 2026-01-25 10:03:05 | INFO | Best 3 checkpoints:
187
+ 2026-01-25 10:03:05 | INFO | 1. Step 699: acc=0.7486 (reward_model.best_699.pt)
188
+ 2026-01-25 10:03:05 | INFO | 2. Step 599: acc=0.7457 (reward_model.best_599.pt)
189
+ 2026-01-25 10:03:05 | INFO | 3. Step 499: acc=0.7457 (reward_model.best_499.pt)
190
+ 2026-01-25 10:03:07 | INFO | Step 700: loss=0.4059 | IF_loss=0.4302, MQ_loss=0.3815 | acc=0.833 (IF=0.812, MQ=0.854) | lr=0.000007
191
+ 2026-01-25 10:05:18 | INFO |
192
+ ============================================================
193
+ Validation Results (took 8.63s):
194
+ Samples: 346 instruction, 346 quality
195
+ Instruction Acc: 0.7139
196
+ Quality Acc: 0.7861
197
+ Average Acc: 0.7500
198
+ Total Loss: 0.5391
199
+ Instruction Loss: 0.5710
200
+ Quality Loss: 0.5071
201
+ ============================================================
202
+ 2026-01-25 10:05:18 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_799.pt (filtered to 21.279M trainable parameters)
203
+ 2026-01-25 10:05:18 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_799.pt (229.9MB)
204
+ 2026-01-25 10:05:18 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_499.pt
205
+ 2026-01-25 10:05:18 | INFO | Best 3 checkpoints:
206
+ 2026-01-25 10:05:18 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
207
+ 2026-01-25 10:05:18 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
208
+ 2026-01-25 10:05:18 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
209
+ 2026-01-25 10:05:20 | INFO | Step 800: loss=0.4310 | IF_loss=0.5054, MQ_loss=0.3567 | acc=0.812 (IF=0.750, MQ=0.875) | lr=0.000007
210
+ 2026-01-25 10:07:31 | INFO |
211
+ ============================================================
212
+ Validation Results (took 8.70s):
213
+ Samples: 346 instruction, 346 quality
214
+ Instruction Acc: 0.7139
215
+ Quality Acc: 0.7746
216
+ Average Acc: 0.7442
217
+ Total Loss: 0.5383
218
+ Instruction Loss: 0.5699
219
+ Quality Loss: 0.5067
220
+ ============================================================
221
+ 2026-01-25 10:07:31 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_899.pt (filtered to 21.279M trainable parameters)
222
+ 2026-01-25 10:07:31 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_899.pt (229.9MB)
223
+ 2026-01-25 10:07:31 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_899.pt
224
+ 2026-01-25 10:07:31 | INFO | Best 3 checkpoints:
225
+ 2026-01-25 10:07:31 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
226
+ 2026-01-25 10:07:31 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
227
+ 2026-01-25 10:07:31 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
228
+ 2026-01-25 10:07:33 | INFO | Step 900: loss=0.5193 | IF_loss=0.5695, MQ_loss=0.4690 | acc=0.677 (IF=0.625, MQ=0.729) | lr=0.000006
229
+ 2026-01-25 10:09:39 | INFO |
230
+ ============================================================
231
+ Validation Results (took 8.45s):
232
+ Samples: 346 instruction, 346 quality
233
+ Instruction Acc: 0.7139
234
+ Quality Acc: 0.7746
235
+ Average Acc: 0.7442
236
+ Total Loss: 0.5387
237
+ Instruction Loss: 0.5706
238
+ Quality Loss: 0.5068
239
+ ============================================================
240
+ 2026-01-25 10:09:39 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_999.pt (filtered to 21.279M trainable parameters)
241
+ 2026-01-25 10:09:39 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_999.pt (229.9MB)
242
+ 2026-01-25 10:09:39 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_999.pt
243
+ 2026-01-25 10:09:39 | INFO | Best 3 checkpoints:
244
+ 2026-01-25 10:09:39 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
245
+ 2026-01-25 10:09:39 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
246
+ 2026-01-25 10:09:39 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
247
+ 2026-01-25 10:09:40 | INFO | Step 1000: loss=0.3726 | IF_loss=0.3096, MQ_loss=0.4357 | acc=0.771 (IF=0.875, MQ=0.667) | lr=0.000005
248
+ 2026-01-25 10:11:48 | INFO |
249
+ ============================================================
250
+ Validation Results (took 7.72s):
251
+ Samples: 346 instruction, 346 quality
252
+ Instruction Acc: 0.7168
253
+ Quality Acc: 0.7688
254
+ Average Acc: 0.7428
255
+ Total Loss: 0.5390
256
+ Instruction Loss: 0.5712
257
+ Quality Loss: 0.5067
258
+ ============================================================
259
+ 2026-01-25 10:11:48 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1099.pt (filtered to 21.279M trainable parameters)
260
+ 2026-01-25 10:11:48 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1099.pt (229.9MB)
261
+ 2026-01-25 10:11:48 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1099.pt
262
+ 2026-01-25 10:11:48 | INFO | Best 3 checkpoints:
263
+ 2026-01-25 10:11:48 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
264
+ 2026-01-25 10:11:48 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
265
+ 2026-01-25 10:11:48 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
266
+ 2026-01-25 10:11:50 | INFO | Step 1100: loss=0.4897 | IF_loss=0.5040, MQ_loss=0.4754 | acc=0.802 (IF=0.792, MQ=0.812) | lr=0.000004
267
+ 2026-01-25 10:13:53 | INFO |
268
+ ============================================================
269
+ Validation Results (took 8.01s):
270
+ Samples: 346 instruction, 346 quality
271
+ Instruction Acc: 0.7168
272
+ Quality Acc: 0.7688
273
+ Average Acc: 0.7428
274
+ Total Loss: 0.5386
275
+ Instruction Loss: 0.5713
276
+ Quality Loss: 0.5059
277
+ ============================================================
278
+ 2026-01-25 10:13:53 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1199.pt (filtered to 21.279M trainable parameters)
279
+ 2026-01-25 10:13:53 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1199.pt (229.9MB)
280
+ 2026-01-25 10:13:53 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1199.pt
281
+ 2026-01-25 10:13:53 | INFO | Best 3 checkpoints:
282
+ 2026-01-25 10:13:53 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
283
+ 2026-01-25 10:13:53 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
284
+ 2026-01-25 10:13:53 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
285
+ 2026-01-25 10:13:54 | INFO | Step 1200: loss=0.4865 | IF_loss=0.5833, MQ_loss=0.3896 | acc=0.750 (IF=0.708, MQ=0.792) | lr=0.000003
286
+ 2026-01-25 10:16:00 | INFO |
287
+ ============================================================
288
+ Validation Results (took 7.76s):
289
+ Samples: 346 instruction, 346 quality
290
+ Instruction Acc: 0.7139
291
+ Quality Acc: 0.7688
292
+ Average Acc: 0.7413
293
+ Total Loss: 0.5391
294
+ Instruction Loss: 0.5722
295
+ Quality Loss: 0.5060
296
+ ============================================================
297
+ 2026-01-25 10:16:01 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1299.pt (filtered to 21.279M trainable parameters)
298
+ 2026-01-25 10:16:01 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1299.pt (229.9MB)
299
+ 2026-01-25 10:16:01 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1299.pt
300
+ 2026-01-25 10:16:01 | INFO | Best 3 checkpoints:
301
+ 2026-01-25 10:16:01 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
302
+ 2026-01-25 10:16:01 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
303
+ 2026-01-25 10:16:01 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
304
+ 2026-01-25 10:16:02 | INFO | Step 1300: loss=0.4948 | IF_loss=0.5460, MQ_loss=0.4435 | acc=0.750 (IF=0.688, MQ=0.812) | lr=0.000003
305
+ 2026-01-25 10:18:05 | INFO |
306
+ ============================================================
307
+ Validation Results (took 8.29s):
308
+ Samples: 346 instruction, 346 quality
309
+ Instruction Acc: 0.7139
310
+ Quality Acc: 0.7717
311
+ Average Acc: 0.7428
312
+ Total Loss: 0.5395
313
+ Instruction Loss: 0.5728
314
+ Quality Loss: 0.5062
315
+ ============================================================
316
+ 2026-01-25 10:18:05 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1399.pt (filtered to 21.279M trainable parameters)
317
+ 2026-01-25 10:18:05 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1399.pt (229.9MB)
318
+ 2026-01-25 10:18:05 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1399.pt
319
+ 2026-01-25 10:18:05 | INFO | Best 3 checkpoints:
320
+ 2026-01-25 10:18:05 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
321
+ 2026-01-25 10:18:05 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
322
+ 2026-01-25 10:18:05 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
323
+ 2026-01-25 10:18:06 | INFO | Step 1400: loss=0.4470 | IF_loss=0.5541, MQ_loss=0.3399 | acc=0.812 (IF=0.750, MQ=0.875) | lr=0.000002
324
+ 2026-01-25 10:20:12 | INFO |
325
+ ============================================================
326
+ Validation Results (took 7.85s):
327
+ Samples: 346 instruction, 346 quality
328
+ Instruction Acc: 0.7139
329
+ Quality Acc: 0.7746
330
+ Average Acc: 0.7442
331
+ Total Loss: 0.5399
332
+ Instruction Loss: 0.5734
333
+ Quality Loss: 0.5064
334
+ ============================================================
335
+ 2026-01-25 10:20:12 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1499.pt (filtered to 21.279M trainable parameters)
336
+ 2026-01-25 10:20:12 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1499.pt (229.9MB)
337
+ 2026-01-25 10:20:12 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1499.pt
338
+ 2026-01-25 10:20:12 | INFO | Best 3 checkpoints:
339
+ 2026-01-25 10:20:12 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
340
+ 2026-01-25 10:20:12 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
341
+ 2026-01-25 10:20:12 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
342
+ 2026-01-25 10:20:13 | INFO | Step 1500: loss=0.3559 | IF_loss=0.4083, MQ_loss=0.3035 | acc=0.833 (IF=0.792, MQ=0.875) | lr=0.000001
343
+ 2026-01-25 10:22:17 | INFO |
344
+ ============================================================
345
+ Validation Results (took 9.35s):
346
+ Samples: 346 instruction, 346 quality
347
+ Instruction Acc: 0.7139
348
+ Quality Acc: 0.7688
349
+ Average Acc: 0.7413
350
+ Total Loss: 0.5398
351
+ Instruction Loss: 0.5737
352
+ Quality Loss: 0.5060
353
+ ============================================================
354
+ 2026-01-25 10:22:17 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1599.pt (filtered to 21.279M trainable parameters)
355
+ 2026-01-25 10:22:17 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1599.pt (229.9MB)
356
+ 2026-01-25 10:22:17 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1599.pt
357
+ 2026-01-25 10:22:17 | INFO | Best 3 checkpoints:
358
+ 2026-01-25 10:22:17 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
359
+ 2026-01-25 10:22:17 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
360
+ 2026-01-25 10:22:17 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
361
+ 2026-01-25 10:22:23 | INFO | Step 1600: loss=0.3699 | IF_loss=0.4525, MQ_loss=0.2873 | acc=0.875 (IF=0.854, MQ=0.896) | lr=0.000001
362
+ 2026-01-25 10:24:25 | INFO |
363
+ ============================================================
364
+ Validation Results (took 7.95s):
365
+ Samples: 346 instruction, 346 quality
366
+ Instruction Acc: 0.7110
367
+ Quality Acc: 0.7717
368
+ Average Acc: 0.7413
369
+ Total Loss: 0.5399
370
+ Instruction Loss: 0.5738
371
+ Quality Loss: 0.5060
372
+ ============================================================
373
+ 2026-01-25 10:24:25 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1699.pt (filtered to 21.279M trainable parameters)
374
+ 2026-01-25 10:24:26 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1699.pt (229.9MB)
375
+ 2026-01-25 10:24:26 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1699.pt
376
+ 2026-01-25 10:24:26 | INFO | Best 3 checkpoints:
377
+ 2026-01-25 10:24:26 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
378
+ 2026-01-25 10:24:26 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
379
+ 2026-01-25 10:24:26 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
380
+ 2026-01-25 10:24:27 | INFO | Step 1700: loss=0.3662 | IF_loss=0.3525, MQ_loss=0.3800 | acc=0.802 (IF=0.792, MQ=0.812) | lr=0.000001
381
+ 2026-01-25 10:26:33 | INFO |
382
+ ============================================================
383
+ Validation Results (took 7.78s):
384
+ Samples: 346 instruction, 346 quality
385
+ Instruction Acc: 0.7139
386
+ Quality Acc: 0.7717
387
+ Average Acc: 0.7428
388
+ Total Loss: 0.5401
389
+ Instruction Loss: 0.5740
390
+ Quality Loss: 0.5063
391
+ ============================================================
392
+ 2026-01-25 10:26:33 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1799.pt (filtered to 21.279M trainable parameters)
393
+ 2026-01-25 10:26:33 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1799.pt (229.9MB)
394
+ 2026-01-25 10:26:33 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1799.pt
395
+ 2026-01-25 10:26:33 | INFO | Best 3 checkpoints:
396
+ 2026-01-25 10:26:33 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
397
+ 2026-01-25 10:26:33 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
398
+ 2026-01-25 10:26:33 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
399
+ 2026-01-25 10:26:34 | INFO | Step 1800: loss=0.4003 | IF_loss=0.4304, MQ_loss=0.3701 | acc=0.823 (IF=0.792, MQ=0.854) | lr=0.000000
400
+ 2026-01-25 10:28:34 | INFO |
401
+ ============================================================
402
+ Validation Results (took 8.07s):
403
+ Samples: 346 instruction, 346 quality
404
+ Instruction Acc: 0.7139
405
+ Quality Acc: 0.7717
406
+ Average Acc: 0.7428
407
+ Total Loss: 0.5401
408
+ Instruction Loss: 0.5739
409
+ Quality Loss: 0.5063
410
+ ============================================================
411
+ 2026-01-25 10:28:34 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1899.pt (filtered to 21.279M trainable parameters)
412
+ 2026-01-25 10:28:34 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1899.pt (229.9MB)
413
+ 2026-01-25 10:28:34 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1899.pt
414
+ 2026-01-25 10:28:34 | INFO | Best 3 checkpoints:
415
+ 2026-01-25 10:28:34 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
416
+ 2026-01-25 10:28:34 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
417
+ 2026-01-25 10:28:34 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
418
+ 2026-01-25 10:28:35 | INFO | Step 1900: loss=0.4593 | IF_loss=0.4853, MQ_loss=0.4333 | acc=0.760 (IF=0.750, MQ=0.771) | lr=0.000000
419
+ 2026-01-25 10:30:41 | INFO |
420
+ ============================================================
421
+ Validation Results (took 7.71s):
422
+ Samples: 346 instruction, 346 quality
423
+ Instruction Acc: 0.7139
424
+ Quality Acc: 0.7717
425
+ Average Acc: 0.7428
426
+ Total Loss: 0.5400
427
+ Instruction Loss: 0.5738
428
+ Quality Loss: 0.5063
429
+ ============================================================
430
+ 2026-01-25 10:30:41 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1999.pt (filtered to 21.279M trainable parameters)
431
+ 2026-01-25 10:30:41 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1999.pt (229.9MB)
432
+ 2026-01-25 10:30:41 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_0947_CA/ckpt/reward_model.best_1999.pt
433
+ 2026-01-25 10:30:41 | INFO | Best 3 checkpoints:
434
+ 2026-01-25 10:30:41 | INFO | 1. Step 799: acc=0.7500 (reward_model.best_799.pt)
435
+ 2026-01-25 10:30:41 | INFO | 2. Step 699: acc=0.7486 (reward_model.best_699.pt)
436
+ 2026-01-25 10:30:41 | INFO | 3. Step 599: acc=0.7457 (reward_model.best_599.pt)
437
+ 2026-01-25 10:30:41 | INFO | Training complete!
438
+ 2026-01-25 10:30:41 | INFO | Training complete!
20260125_1117/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '5'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: true
10
+ apply_to_ref: true
11
+ enabled: true
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 200
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ max_samples: null
21
+ max_val_samples: null
22
+ metadata_jsonl: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/all_comparisons.jsonl
23
+ mode: raw_text_frozen_audio
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: null
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ no_condition: false
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_audio: true
107
+ use_layer_idx: -1
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: null
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 2000
125
+ warmup_steps: 10
126
+ max_grad_norm: 1
127
+ mlp_lr: 1.0e-05
128
+ num_train_steps: 2000
129
+ num_valid_batches: null
130
+ num_workers: 8
131
+ other_lr: 1.0e-05
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/contrastive/20260123_1403_tune_mulan_transformer/ckpt/reward_model.best_27252.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: false
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 100
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: false
20260125_1117/reward_model/1769311084.1305242/events.out.tfevents.1769311084.MACLAB-S004.2009526.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:822b6fceabde39473c4a729e682f16fe698e1ddda674a89dfd54e6dae8b6b5bc
3
+ size 503
20260125_1117/reward_model/1769311084.1322424/hparams.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ batch_size: 48
2
+ grad_accum_every: 1
3
+ learning_rate: 1.0e-05
4
+ num_train_steps: 2000
20260125_1117/reward_model/events.out.tfevents.1769311084.MACLAB-S004.2009526.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5324959f8755a329daa5765d181b2712b8dc36505a84bd25731dd5d0b8969191
3
+ size 874266
20260125_1117/train.20260125_1117.log ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-25 11:17:53 | INFO | Log file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/train.20260125_1117.log
2
+ 2026-01-25 11:17:53 | INFO | Random seed set to 42
3
+ 2026-01-25 11:17:54 | INFO | Created RawTextFrozenAudioDataset with 3463 samples
4
+ 2026-01-25 11:17:54 | INFO | Split dataset into train (3117) and validation (346) sets (ratio: 10.00%)
5
+ 2026-01-25 11:17:54 | INFO | Will resume from checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/contrastive/20260123_1403_tune_mulan_transformer/ckpt/reward_model.best_27252.pt
6
+ 2026-01-25 11:17:54 | INFO | Using checkpoint config for model initialization (continue training mode)
7
+ 2026-01-25 11:18:01 | INFO | Created RewardAttentionModel with attention_mode=CA
8
+ 2026-01-25 11:18:01 | INFO | Created PreferenceLoss with filter_ties=True
9
+ 2026-01-25 11:18:02 | INFO | ✓ Gradient checkpointing enabled
10
+ 2026-01-25 11:18:02 | INFO | ✓ Audio cropping enabled: min=200, max=1500
11
+ 2026-01-25 11:18:02 | INFO | Apply to eval: True, ref: True
12
+ 2026-01-25 11:18:02 | INFO | Modes: train=random, val=start
13
+ 2026-01-25 11:18:02 | INFO | MLP head parameters: 1,186,563 params, lr=1e-05
14
+ 2026-01-25 11:18:02 | INFO | Other parameters: 70,437,634 params, lr=1e-05
15
+ 2026-01-25 11:18:02 | INFO | Using lr_schedule=linear_cosine warmup_steps=10 total_steps=2000
16
+ 2026-01-25 11:18:02 | INFO | Training with fixed validation set
17
+ 2026-01-25 11:18:02 | INFO | Train batch_size: 48, Valid batch_size: 20
18
+ 2026-01-25 11:18:03 | INFO | Skipping score_projector.3.weight: shape mismatch (ckpt torch.Size([1, 768]) vs model torch.Size([2, 768])), will use randomly initialized weights
19
+ 2026-01-25 11:18:03 | INFO | Skipping score_projector.3.bias: shape mismatch (ckpt torch.Size([1]) vs model torch.Size([2])), will use randomly initialized weights
20
+ 2026-01-25 11:18:03 | INFO | Missing keys (712): ['score_projector.3.weight', 'score_projector.3.bias', 'text_module.model.embeddings.word_embeddings.weight', 'text_module.model.embeddings.position_embeddings.weight', 'text_module.model.embeddings.token_type_embeddings.weight']...
21
+ 2026-01-25 11:18:03 | INFO | ✓ Starting from step 0 (transfer learning mode, ignoring checkpoint steps=27252)
22
+ 2026-01-25 11:18:03 | INFO | Resumed from /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/contrastive/20260123_1403_tune_mulan_transformer/ckpt/reward_model.best_27252.pt
23
+ 2026-01-25 11:18:03 | INFO | Parameters: 683.857M total, 71.624M trainable
24
+ 2026-01-25 11:18:03 | INFO | Text encoder (frozen): 278.044M
25
+ 2026-01-25 11:18:03 | INFO | Text encoder (trainable): 50.345M
26
+ 2026-01-25 11:18:03 | INFO | Audio encoder (frozen): 334.189M
27
+ 2026-01-25 11:18:03 | INFO | Other trainable: 21.279M
28
+ 2026-01-25 11:18:03 | INFO | ℹ No LoRA configuration detected
29
+ 2026-01-25 11:18:04 | INFO | ============================================================
30
+ 2026-01-25 11:18:04 | INFO | Ready to start training
31
+ 2026-01-25 11:18:04 | INFO | ============================================================
32
+ 2026-01-25 11:18:04 | INFO | Starting training from step 0
33
+ 2026-01-25 11:18:04 | INFO | ===== Accelerator / CUDA Debug Info =====
34
+ 2026-01-25 11:18:04 | INFO | accelerator.device = cuda
35
+ 2026-01-25 11:18:04 | INFO | mixed_precision = bf16
36
+ 2026-01-25 11:18:04 | INFO | distributed_type = NO
37
+ 2026-01-25 11:18:04 | INFO | num_processes = 1
38
+ 2026-01-25 11:18:04 | INFO | process_index = 0
39
+ 2026-01-25 11:18:04 | INFO | is_main_process = True
40
+ 2026-01-25 11:18:04 | INFO | torch.cuda.is_available() = True
41
+ 2026-01-25 11:18:04 | INFO | torch.cuda.device_count() = 1
42
+ 2026-01-25 11:18:04 | INFO | current_device = 0
43
+ 2026-01-25 11:18:04 | INFO | device_name = NVIDIA GeForce RTX 4090
44
+ 2026-01-25 11:18:04 | INFO | model parameter device = cuda:0
45
+ 2026-01-25 11:18:04 | INFO | Training for 2000.0 steps (~32 epochs, 64 steps/epoch)
46
+ 2026-01-25 11:18:04 | INFO |
47
+ ============================================================
48
+ 2026-01-25 11:18:04 | INFO | Running initial validation after resume...
49
+ 2026-01-25 11:18:04 | INFO | ============================================================
50
+ 2026-01-25 11:18:17 | INFO |
51
+ ============================================================
52
+ Validation Results (took 13.32s):
53
+ Samples: 346 instruction, 346 quality
54
+ Instruction Acc: 0.5405
55
+ Quality Acc: 0.4740
56
+ Average Acc: 0.5072
57
+ Total Loss: 0.6589
58
+ Instruction Loss: 0.6537
59
+ Quality Loss: 0.6641
60
+ ============================================================
61
+ 2026-01-25 11:18:17 | INFO | Initial validation complete.
62
+
63
+ 2026-01-25 11:18:22 | INFO | Step 0: loss=0.6765 | IF_loss=0.6586, MQ_loss=0.6944 | acc=0.552 (IF=0.604, MQ=0.500) | lr=0.000002
64
+ 2026-01-25 11:18:23 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.0.pt (filtered to 71.624M trainable parameters)
65
+ 2026-01-25 11:18:23 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.0.pt (422.0MB)
66
+ 2026-01-25 11:18:23 | INFO | Step 0: Saved to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.0.pt
67
+ 2026-01-25 11:20:31 | INFO |
68
+ ============================================================
69
+ Validation Results (took 8.99s):
70
+ Samples: 346 instruction, 346 quality
71
+ Instruction Acc: 0.6763
72
+ Quality Acc: 0.6965
73
+ Average Acc: 0.6864
74
+ Total Loss: 0.5779
75
+ Instruction Loss: 0.6022
76
+ Quality Loss: 0.5537
77
+ ============================================================
78
+ 2026-01-25 11:20:31 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_99.pt (filtered to 71.624M trainable parameters)
79
+ 2026-01-25 11:20:31 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_99.pt (422.0MB)
80
+ 2026-01-25 11:20:31 | INFO | Best 1 checkpoints:
81
+ 2026-01-25 11:20:31 | INFO | 1. Step 99: acc=0.6864 (reward_model.best_99.pt)
82
+ 2026-01-25 11:20:32 | INFO | Step 100: loss=0.5965 | IF_loss=0.6046, MQ_loss=0.5884 | acc=0.688 (IF=0.646, MQ=0.729) | lr=0.000010
83
+ 2026-01-25 11:22:45 | INFO |
84
+ ============================================================
85
+ Validation Results (took 9.25s):
86
+ Samples: 346 instruction, 346 quality
87
+ Instruction Acc: 0.6821
88
+ Quality Acc: 0.7110
89
+ Average Acc: 0.6965
90
+ Total Loss: 0.5492
91
+ Instruction Loss: 0.5805
92
+ Quality Loss: 0.5180
93
+ ============================================================
94
+ 2026-01-25 11:22:45 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_199.pt (filtered to 71.624M trainable parameters)
95
+ 2026-01-25 11:22:45 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_199.pt (422.0MB)
96
+ 2026-01-25 11:22:45 | INFO | Best 2 checkpoints:
97
+ 2026-01-25 11:22:45 | INFO | 1. Step 199: acc=0.6965 (reward_model.best_199.pt)
98
+ 2026-01-25 11:22:45 | INFO | 2. Step 99: acc=0.6864 (reward_model.best_99.pt)
99
+ 2026-01-25 11:22:46 | INFO | Step 200: loss=0.5629 | IF_loss=0.5719, MQ_loss=0.5538 | acc=0.708 (IF=0.667, MQ=0.750) | lr=0.000010
100
+ 2026-01-25 11:24:58 | INFO |
101
+ ============================================================
102
+ Validation Results (took 8.81s):
103
+ Samples: 346 instruction, 346 quality
104
+ Instruction Acc: 0.6879
105
+ Quality Acc: 0.7370
106
+ Average Acc: 0.7124
107
+ Total Loss: 0.5398
108
+ Instruction Loss: 0.5714
109
+ Quality Loss: 0.5082
110
+ ============================================================
111
+ 2026-01-25 11:24:58 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_299.pt (filtered to 71.624M trainable parameters)
112
+ 2026-01-25 11:24:58 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_299.pt (422.0MB)
113
+ 2026-01-25 11:24:58 | INFO | Best 3 checkpoints:
114
+ 2026-01-25 11:24:58 | INFO | 1. Step 299: acc=0.7124 (reward_model.best_299.pt)
115
+ 2026-01-25 11:24:58 | INFO | 2. Step 199: acc=0.6965 (reward_model.best_199.pt)
116
+ 2026-01-25 11:24:58 | INFO | 3. Step 99: acc=0.6864 (reward_model.best_99.pt)
117
+ 2026-01-25 11:24:59 | INFO | Step 300: loss=0.4994 | IF_loss=0.5058, MQ_loss=0.4929 | acc=0.802 (IF=0.812, MQ=0.792) | lr=0.000009
118
+ 2026-01-25 11:27:15 | INFO |
119
+ ============================================================
120
+ Validation Results (took 8.79s):
121
+ Samples: 346 instruction, 346 quality
122
+ Instruction Acc: 0.7023
123
+ Quality Acc: 0.7312
124
+ Average Acc: 0.7168
125
+ Total Loss: 0.5318
126
+ Instruction Loss: 0.5617
127
+ Quality Loss: 0.5019
128
+ ============================================================
129
+ 2026-01-25 11:27:15 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_399.pt (filtered to 71.624M trainable parameters)
130
+ 2026-01-25 11:27:16 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_399.pt (422.0MB)
131
+ 2026-01-25 11:27:16 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_99.pt
132
+ 2026-01-25 11:27:16 | INFO | Best 3 checkpoints:
133
+ 2026-01-25 11:27:16 | INFO | 1. Step 399: acc=0.7168 (reward_model.best_399.pt)
134
+ 2026-01-25 11:27:16 | INFO | 2. Step 299: acc=0.7124 (reward_model.best_299.pt)
135
+ 2026-01-25 11:27:16 | INFO | 3. Step 199: acc=0.6965 (reward_model.best_199.pt)
136
+ 2026-01-25 11:27:17 | INFO | Step 400: loss=0.4955 | IF_loss=0.4808, MQ_loss=0.5101 | acc=0.698 (IF=0.667, MQ=0.729) | lr=0.000009
137
+ 2026-01-25 11:29:28 | INFO |
138
+ ============================================================
139
+ Validation Results (took 9.06s):
140
+ Samples: 346 instruction, 346 quality
141
+ Instruction Acc: 0.7110
142
+ Quality Acc: 0.7254
143
+ Average Acc: 0.7182
144
+ Total Loss: 0.5266
145
+ Instruction Loss: 0.5560
146
+ Quality Loss: 0.4972
147
+ ============================================================
148
+ 2026-01-25 11:29:28 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_499.pt (filtered to 71.624M trainable parameters)
149
+ 2026-01-25 11:29:28 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_499.pt (422.0MB)
150
+ 2026-01-25 11:29:28 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_199.pt
151
+ 2026-01-25 11:29:28 | INFO | Best 3 checkpoints:
152
+ 2026-01-25 11:29:28 | INFO | 1. Step 499: acc=0.7182 (reward_model.best_499.pt)
153
+ 2026-01-25 11:29:28 | INFO | 2. Step 399: acc=0.7168 (reward_model.best_399.pt)
154
+ 2026-01-25 11:29:28 | INFO | 3. Step 299: acc=0.7124 (reward_model.best_299.pt)
155
+ 2026-01-25 11:29:29 | INFO | Step 500: loss=0.4977 | IF_loss=0.5734, MQ_loss=0.4219 | acc=0.688 (IF=0.667, MQ=0.708) | lr=0.000009
156
+ 2026-01-25 11:31:41 | INFO |
157
+ ============================================================
158
+ Validation Results (took 8.66s):
159
+ Samples: 346 instruction, 346 quality
160
+ Instruction Acc: 0.7168
161
+ Quality Acc: 0.7283
162
+ Average Acc: 0.7225
163
+ Total Loss: 0.5231
164
+ Instruction Loss: 0.5528
165
+ Quality Loss: 0.4934
166
+ ============================================================
167
+ 2026-01-25 11:31:41 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_599.pt (filtered to 71.624M trainable parameters)
168
+ 2026-01-25 11:31:41 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_599.pt (422.0MB)
169
+ 2026-01-25 11:31:41 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_299.pt
170
+ 2026-01-25 11:31:41 | INFO | Best 3 checkpoints:
171
+ 2026-01-25 11:31:41 | INFO | 1. Step 599: acc=0.7225 (reward_model.best_599.pt)
172
+ 2026-01-25 11:31:41 | INFO | 2. Step 499: acc=0.7182 (reward_model.best_499.pt)
173
+ 2026-01-25 11:31:41 | INFO | 3. Step 399: acc=0.7168 (reward_model.best_399.pt)
174
+ 2026-01-25 11:31:43 | INFO | Step 600: loss=0.5072 | IF_loss=0.4980, MQ_loss=0.5164 | acc=0.698 (IF=0.688, MQ=0.708) | lr=0.000008
175
+ 2026-01-25 11:33:51 | INFO |
176
+ ============================================================
177
+ Validation Results (took 9.03s):
178
+ Samples: 346 instruction, 346 quality
179
+ Instruction Acc: 0.7168
180
+ Quality Acc: 0.7341
181
+ Average Acc: 0.7254
182
+ Total Loss: 0.5201
183
+ Instruction Loss: 0.5493
184
+ Quality Loss: 0.4909
185
+ ============================================================
186
+ 2026-01-25 11:33:52 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_699.pt (filtered to 71.624M trainable parameters)
187
+ 2026-01-25 11:33:52 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_699.pt (422.0MB)
188
+ 2026-01-25 11:33:52 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_399.pt
189
+ 2026-01-25 11:33:52 | INFO | Best 3 checkpoints:
190
+ 2026-01-25 11:33:52 | INFO | 1. Step 699: acc=0.7254 (reward_model.best_699.pt)
191
+ 2026-01-25 11:33:52 | INFO | 2. Step 599: acc=0.7225 (reward_model.best_599.pt)
192
+ 2026-01-25 11:33:52 | INFO | 3. Step 499: acc=0.7182 (reward_model.best_499.pt)
193
+ 2026-01-25 11:33:53 | INFO | Step 700: loss=0.4063 | IF_loss=0.4648, MQ_loss=0.3477 | acc=0.833 (IF=0.812, MQ=0.854) | lr=0.000007
194
+ 2026-01-25 11:36:07 | INFO |
195
+ ============================================================
196
+ Validation Results (took 8.47s):
197
+ Samples: 346 instruction, 346 quality
198
+ Instruction Acc: 0.7168
199
+ Quality Acc: 0.7341
200
+ Average Acc: 0.7254
201
+ Total Loss: 0.5200
202
+ Instruction Loss: 0.5501
203
+ Quality Loss: 0.4900
204
+ ============================================================
205
+ 2026-01-25 11:36:07 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_799.pt (filtered to 71.624M trainable parameters)
206
+ 2026-01-25 11:36:07 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_799.pt (422.0MB)
207
+ 2026-01-25 11:36:07 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_499.pt
208
+ 2026-01-25 11:36:07 | INFO | Best 3 checkpoints:
209
+ 2026-01-25 11:36:07 | INFO | 1. Step 699: acc=0.7254 (reward_model.best_699.pt)
210
+ 2026-01-25 11:36:07 | INFO | 2. Step 799: acc=0.7254 (reward_model.best_799.pt)
211
+ 2026-01-25 11:36:07 | INFO | 3. Step 599: acc=0.7225 (reward_model.best_599.pt)
212
+ 2026-01-25 11:36:08 | INFO | Step 800: loss=0.4288 | IF_loss=0.4825, MQ_loss=0.3751 | acc=0.740 (IF=0.688, MQ=0.792) | lr=0.000007
213
+ 2026-01-25 11:38:19 | INFO |
214
+ ============================================================
215
+ Validation Results (took 8.31s):
216
+ Samples: 346 instruction, 346 quality
217
+ Instruction Acc: 0.7225
218
+ Quality Acc: 0.7370
219
+ Average Acc: 0.7298
220
+ Total Loss: 0.5181
221
+ Instruction Loss: 0.5471
222
+ Quality Loss: 0.4891
223
+ ============================================================
224
+ 2026-01-25 11:38:19 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_899.pt (filtered to 71.624M trainable parameters)
225
+ 2026-01-25 11:38:19 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_899.pt (422.0MB)
226
+ 2026-01-25 11:38:19 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_599.pt
227
+ 2026-01-25 11:38:19 | INFO | Best 3 checkpoints:
228
+ 2026-01-25 11:38:19 | INFO | 1. Step 899: acc=0.7298 (reward_model.best_899.pt)
229
+ 2026-01-25 11:38:19 | INFO | 2. Step 699: acc=0.7254 (reward_model.best_699.pt)
230
+ 2026-01-25 11:38:19 | INFO | 3. Step 799: acc=0.7254 (reward_model.best_799.pt)
231
+ 2026-01-25 11:38:21 | INFO | Step 900: loss=0.5461 | IF_loss=0.6051, MQ_loss=0.4871 | acc=0.708 (IF=0.625, MQ=0.792) | lr=0.000006
232
+ 2026-01-25 11:40:29 | INFO |
233
+ ============================================================
234
+ Validation Results (took 9.00s):
235
+ Samples: 346 instruction, 346 quality
236
+ Instruction Acc: 0.7225
237
+ Quality Acc: 0.7370
238
+ Average Acc: 0.7298
239
+ Total Loss: 0.5177
240
+ Instruction Loss: 0.5469
241
+ Quality Loss: 0.4885
242
+ ============================================================
243
+ 2026-01-25 11:40:30 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_999.pt (filtered to 71.624M trainable parameters)
244
+ 2026-01-25 11:40:30 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_999.pt (422.0MB)
245
+ 2026-01-25 11:40:30 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_799.pt
246
+ 2026-01-25 11:40:30 | INFO | Best 3 checkpoints:
247
+ 2026-01-25 11:40:30 | INFO | 1. Step 899: acc=0.7298 (reward_model.best_899.pt)
248
+ 2026-01-25 11:40:30 | INFO | 2. Step 999: acc=0.7298 (reward_model.best_999.pt)
249
+ 2026-01-25 11:40:30 | INFO | 3. Step 699: acc=0.7254 (reward_model.best_699.pt)
250
+ 2026-01-25 11:40:31 | INFO | Step 1000: loss=0.4418 | IF_loss=0.4662, MQ_loss=0.4175 | acc=0.708 (IF=0.688, MQ=0.729) | lr=0.000005
251
+ 2026-01-25 11:42:40 | INFO |
252
+ ============================================================
253
+ Validation Results (took 8.11s):
254
+ Samples: 346 instruction, 346 quality
255
+ Instruction Acc: 0.7168
256
+ Quality Acc: 0.7370
257
+ Average Acc: 0.7269
258
+ Total Loss: 0.5173
259
+ Instruction Loss: 0.5461
260
+ Quality Loss: 0.4885
261
+ ============================================================
262
+ 2026-01-25 11:42:41 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1099.pt (filtered to 71.624M trainable parameters)
263
+ 2026-01-25 11:42:41 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1099.pt (422.0MB)
264
+ 2026-01-25 11:42:41 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_699.pt
265
+ 2026-01-25 11:42:41 | INFO | Best 3 checkpoints:
266
+ 2026-01-25 11:42:41 | INFO | 1. Step 899: acc=0.7298 (reward_model.best_899.pt)
267
+ 2026-01-25 11:42:41 | INFO | 2. Step 999: acc=0.7298 (reward_model.best_999.pt)
268
+ 2026-01-25 11:42:41 | INFO | 3. Step 1099: acc=0.7269 (reward_model.best_1099.pt)
269
+ 2026-01-25 11:42:42 | INFO | Step 1100: loss=0.4653 | IF_loss=0.5016, MQ_loss=0.4290 | acc=0.760 (IF=0.708, MQ=0.812) | lr=0.000004
270
+ 2026-01-25 11:44:51 | INFO |
271
+ ============================================================
272
+ Validation Results (took 8.91s):
273
+ Samples: 346 instruction, 346 quality
274
+ Instruction Acc: 0.7254
275
+ Quality Acc: 0.7370
276
+ Average Acc: 0.7312
277
+ Total Loss: 0.5172
278
+ Instruction Loss: 0.5464
279
+ Quality Loss: 0.4879
280
+ ============================================================
281
+ 2026-01-25 11:44:51 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1199.pt (filtered to 71.624M trainable parameters)
282
+ 2026-01-25 11:44:52 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1199.pt (422.0MB)
283
+ 2026-01-25 11:44:52 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1099.pt
284
+ 2026-01-25 11:44:52 | INFO | Best 3 checkpoints:
285
+ 2026-01-25 11:44:52 | INFO | 1. Step 1199: acc=0.7312 (reward_model.best_1199.pt)
286
+ 2026-01-25 11:44:52 | INFO | 2. Step 899: acc=0.7298 (reward_model.best_899.pt)
287
+ 2026-01-25 11:44:52 | INFO | 3. Step 999: acc=0.7298 (reward_model.best_999.pt)
288
+ 2026-01-25 11:44:53 | INFO | Step 1200: loss=0.5002 | IF_loss=0.5816, MQ_loss=0.4188 | acc=0.760 (IF=0.688, MQ=0.833) | lr=0.000003
289
+ 2026-01-25 11:47:08 | INFO |
290
+ ============================================================
291
+ Validation Results (took 8.34s):
292
+ Samples: 346 instruction, 346 quality
293
+ Instruction Acc: 0.7254
294
+ Quality Acc: 0.7399
295
+ Average Acc: 0.7327
296
+ Total Loss: 0.5170
297
+ Instruction Loss: 0.5456
298
+ Quality Loss: 0.4884
299
+ ============================================================
300
+ 2026-01-25 11:47:08 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1299.pt (filtered to 71.624M trainable parameters)
301
+ 2026-01-25 11:47:08 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1299.pt (422.0MB)
302
+ 2026-01-25 11:47:08 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_999.pt
303
+ 2026-01-25 11:47:08 | INFO | Best 3 checkpoints:
304
+ 2026-01-25 11:47:08 | INFO | 1. Step 1299: acc=0.7327 (reward_model.best_1299.pt)
305
+ 2026-01-25 11:47:08 | INFO | 2. Step 1199: acc=0.7312 (reward_model.best_1199.pt)
306
+ 2026-01-25 11:47:08 | INFO | 3. Step 899: acc=0.7298 (reward_model.best_899.pt)
307
+ 2026-01-25 11:47:10 | INFO | Step 1300: loss=0.5330 | IF_loss=0.6011, MQ_loss=0.4650 | acc=0.729 (IF=0.688, MQ=0.771) | lr=0.000003
308
+ 2026-01-25 11:49:22 | INFO |
309
+ ============================================================
310
+ Validation Results (took 8.80s):
311
+ Samples: 346 instruction, 346 quality
312
+ Instruction Acc: 0.7254
313
+ Quality Acc: 0.7370
314
+ Average Acc: 0.7312
315
+ Total Loss: 0.5172
316
+ Instruction Loss: 0.5459
317
+ Quality Loss: 0.4884
318
+ ============================================================
319
+ 2026-01-25 11:49:22 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1399.pt (filtered to 71.624M trainable parameters)
320
+ 2026-01-25 11:49:23 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1399.pt (422.0MB)
321
+ 2026-01-25 11:49:23 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_899.pt
322
+ 2026-01-25 11:49:23 | INFO | Best 3 checkpoints:
323
+ 2026-01-25 11:49:23 | INFO | 1. Step 1299: acc=0.7327 (reward_model.best_1299.pt)
324
+ 2026-01-25 11:49:23 | INFO | 2. Step 1199: acc=0.7312 (reward_model.best_1199.pt)
325
+ 2026-01-25 11:49:23 | INFO | 3. Step 1399: acc=0.7312 (reward_model.best_1399.pt)
326
+ 2026-01-25 11:49:24 | INFO | Step 1400: loss=0.4927 | IF_loss=0.5769, MQ_loss=0.4085 | acc=0.708 (IF=0.667, MQ=0.750) | lr=0.000002
327
+ 2026-01-25 11:51:38 | INFO |
328
+ ============================================================
329
+ Validation Results (took 9.00s):
330
+ Samples: 346 instruction, 346 quality
331
+ Instruction Acc: 0.7254
332
+ Quality Acc: 0.7370
333
+ Average Acc: 0.7312
334
+ Total Loss: 0.5166
335
+ Instruction Loss: 0.5454
336
+ Quality Loss: 0.4878
337
+ ============================================================
338
+ 2026-01-25 11:51:38 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1499.pt (filtered to 71.624M trainable parameters)
339
+ 2026-01-25 11:51:38 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1499.pt (422.0MB)
340
+ 2026-01-25 11:51:38 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1499.pt
341
+ 2026-01-25 11:51:38 | INFO | Best 3 checkpoints:
342
+ 2026-01-25 11:51:38 | INFO | 1. Step 1299: acc=0.7327 (reward_model.best_1299.pt)
343
+ 2026-01-25 11:51:38 | INFO | 2. Step 1199: acc=0.7312 (reward_model.best_1199.pt)
344
+ 2026-01-25 11:51:38 | INFO | 3. Step 1399: acc=0.7312 (reward_model.best_1399.pt)
345
+ 2026-01-25 11:51:39 | INFO | Step 1500: loss=0.4455 | IF_loss=0.4833, MQ_loss=0.4076 | acc=0.750 (IF=0.708, MQ=0.792) | lr=0.000001
346
+ 2026-01-25 11:53:52 | INFO |
347
+ ============================================================
348
+ Validation Results (took 9.35s):
349
+ Samples: 346 instruction, 346 quality
350
+ Instruction Acc: 0.7254
351
+ Quality Acc: 0.7370
352
+ Average Acc: 0.7312
353
+ Total Loss: 0.5173
354
+ Instruction Loss: 0.5462
355
+ Quality Loss: 0.4884
356
+ ============================================================
357
+ 2026-01-25 11:53:52 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1599.pt (filtered to 71.624M trainable parameters)
358
+ 2026-01-25 11:53:53 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1599.pt (422.0MB)
359
+ 2026-01-25 11:53:53 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1599.pt
360
+ 2026-01-25 11:53:53 | INFO | Best 3 checkpoints:
361
+ 2026-01-25 11:53:53 | INFO | 1. Step 1299: acc=0.7327 (reward_model.best_1299.pt)
362
+ 2026-01-25 11:53:53 | INFO | 2. Step 1199: acc=0.7312 (reward_model.best_1199.pt)
363
+ 2026-01-25 11:53:53 | INFO | 3. Step 1399: acc=0.7312 (reward_model.best_1399.pt)
364
+ 2026-01-25 11:53:57 | INFO | Step 1600: loss=0.4107 | IF_loss=0.4762, MQ_loss=0.3453 | acc=0.823 (IF=0.833, MQ=0.812) | lr=0.000001
365
+ 2026-01-25 11:56:08 | INFO |
366
+ ============================================================
367
+ Validation Results (took 9.61s):
368
+ Samples: 346 instruction, 346 quality
369
+ Instruction Acc: 0.7254
370
+ Quality Acc: 0.7341
371
+ Average Acc: 0.7298
372
+ Total Loss: 0.5172
373
+ Instruction Loss: 0.5463
374
+ Quality Loss: 0.4881
375
+ ============================================================
376
+ 2026-01-25 11:56:08 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1699.pt (filtered to 71.624M trainable parameters)
377
+ 2026-01-25 11:56:09 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1699.pt (422.0MB)
378
+ 2026-01-25 11:56:09 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1699.pt
379
+ 2026-01-25 11:56:09 | INFO | Best 3 checkpoints:
380
+ 2026-01-25 11:56:09 | INFO | 1. Step 1299: acc=0.7327 (reward_model.best_1299.pt)
381
+ 2026-01-25 11:56:09 | INFO | 2. Step 1199: acc=0.7312 (reward_model.best_1199.pt)
382
+ 2026-01-25 11:56:09 | INFO | 3. Step 1399: acc=0.7312 (reward_model.best_1399.pt)
383
+ 2026-01-25 11:56:10 | INFO | Step 1700: loss=0.4612 | IF_loss=0.4737, MQ_loss=0.4487 | acc=0.802 (IF=0.750, MQ=0.854) | lr=0.000001
384
+ 2026-01-25 11:58:26 | INFO |
385
+ ============================================================
386
+ Validation Results (took 9.02s):
387
+ Samples: 346 instruction, 346 quality
388
+ Instruction Acc: 0.7254
389
+ Quality Acc: 0.7341
390
+ Average Acc: 0.7298
391
+ Total Loss: 0.5173
392
+ Instruction Loss: 0.5463
393
+ Quality Loss: 0.4883
394
+ ============================================================
395
+ 2026-01-25 11:58:26 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1799.pt (filtered to 71.624M trainable parameters)
396
+ 2026-01-25 11:58:26 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1799.pt (422.0MB)
397
+ 2026-01-25 11:58:26 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1799.pt
398
+ 2026-01-25 11:58:26 | INFO | Best 3 checkpoints:
399
+ 2026-01-25 11:58:26 | INFO | 1. Step 1299: acc=0.7327 (reward_model.best_1299.pt)
400
+ 2026-01-25 11:58:26 | INFO | 2. Step 1199: acc=0.7312 (reward_model.best_1199.pt)
401
+ 2026-01-25 11:58:26 | INFO | 3. Step 1399: acc=0.7312 (reward_model.best_1399.pt)
402
+ 2026-01-25 11:58:27 | INFO | Step 1800: loss=0.4209 | IF_loss=0.4485, MQ_loss=0.3933 | acc=0.833 (IF=0.833, MQ=0.833) | lr=0.000000
403
+ 2026-01-25 12:00:38 | INFO |
404
+ ============================================================
405
+ Validation Results (took 9.32s):
406
+ Samples: 346 instruction, 346 quality
407
+ Instruction Acc: 0.7283
408
+ Quality Acc: 0.7341
409
+ Average Acc: 0.7312
410
+ Total Loss: 0.5172
411
+ Instruction Loss: 0.5464
412
+ Quality Loss: 0.4881
413
+ ============================================================
414
+ 2026-01-25 12:00:38 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1899.pt (filtered to 71.624M trainable parameters)
415
+ 2026-01-25 12:00:39 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1899.pt (422.0MB)
416
+ 2026-01-25 12:00:39 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1899.pt
417
+ 2026-01-25 12:00:39 | INFO | Best 3 checkpoints:
418
+ 2026-01-25 12:00:39 | INFO | 1. Step 1299: acc=0.7327 (reward_model.best_1299.pt)
419
+ 2026-01-25 12:00:39 | INFO | 2. Step 1199: acc=0.7312 (reward_model.best_1199.pt)
420
+ 2026-01-25 12:00:39 | INFO | 3. Step 1399: acc=0.7312 (reward_model.best_1399.pt)
421
+ 2026-01-25 12:00:40 | INFO | Step 1900: loss=0.5161 | IF_loss=0.5734, MQ_loss=0.4587 | acc=0.688 (IF=0.646, MQ=0.729) | lr=0.000000
422
+ 2026-01-25 12:02:54 | INFO |
423
+ ============================================================
424
+ Validation Results (took 8.56s):
425
+ Samples: 346 instruction, 346 quality
426
+ Instruction Acc: 0.7283
427
+ Quality Acc: 0.7341
428
+ Average Acc: 0.7312
429
+ Total Loss: 0.5172
430
+ Instruction Loss: 0.5463
431
+ Quality Loss: 0.4881
432
+ ============================================================
433
+ 2026-01-25 12:02:54 | INFO | Saving checkpoint to /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1999.pt (filtered to 71.624M trainable parameters)
434
+ 2026-01-25 12:02:54 | INFO | Checkpoint saved: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1999.pt (422.0MB)
435
+ 2026-01-25 12:02:54 | INFO | Removed old checkpoint: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1117/ckpt/reward_model.best_1999.pt
436
+ 2026-01-25 12:02:54 | INFO | Best 3 checkpoints:
437
+ 2026-01-25 12:02:54 | INFO | 1. Step 1299: acc=0.7327 (reward_model.best_1299.pt)
438
+ 2026-01-25 12:02:54 | INFO | 2. Step 1199: acc=0.7312 (reward_model.best_1199.pt)
439
+ 2026-01-25 12:02:54 | INFO | 3. Step 1399: acc=0.7312 (reward_model.best_1399.pt)
440
+ 2026-01-25 12:02:54 | INFO | Training complete!
441
+ 2026-01-25 12:02:54 | INFO | Training complete!
20260125_1231/config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '5'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: true
10
+ apply_to_ref: true
11
+ enabled: true
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 200
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ max_samples: null
21
+ max_val_samples: null
22
+ metadata_jsonl: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/all_comparisons.jsonl
23
+ mode: raw_text_frozen_audio
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: null
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ no_condition: false
87
+ null_embedding:
88
+ audio:
89
+ dropout: 0.5
90
+ length: 10
91
+ lyrics:
92
+ dropout: 0.3
93
+ length: 10
94
+ text:
95
+ dropout: 0
96
+ length: 10
97
+ output_dim: 2
98
+ prompt_tf_depth: 4
99
+ sr: 24000
100
+ text_encoder:
101
+ name: muq_mulan
102
+ tune: null
103
+ text_lora_config: null
104
+ train_muq_depth: 0
105
+ train_muqmulan: false
106
+ use_audio: true
107
+ use_layer_idx: -1
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: null
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 10000
125
+ warmup_steps: 10
126
+ max_grad_norm: 1
127
+ mlp_lr: 1.0e-05
128
+ num_train_steps: 10000
129
+ num_valid_batches: null
130
+ num_workers: 8
131
+ other_lr: 1.0e-05
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/contrastive/20260123_1403_tune_mulan_transformer/ckpt/reward_model.best_27252.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: false
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 500
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: false
20260125_1231/eval_results_0125_1707.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
20260125_1231/reward_model/1769315504.5030606/events.out.tfevents.1769315504.MACLAB-S004.2360364.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4430ab0d26659fcc57b20fd55521428ea8d75daa98b78169e91d25ebffd673d8
3
+ size 503
20260125_1231/reward_model/1769315504.5045948/hparams.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ batch_size: 48
2
+ grad_accum_every: 1
3
+ learning_rate: 1.0e-05
4
+ num_train_steps: 10000
20260125_1231/reward_model/events.out.tfevents.1769315504.MACLAB-S004.2360364.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76c1b87e41adf1f3ac89056217504376dbdc359f097f179ff31512b42ce3c00f
3
+ size 5986202
20260125_1231/test_20260125_191012_reward_model.best_4499/test_results.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metrics": {
3
+ "overall": {
4
+ "total_samples": 3463,
5
+ "mq": {
6
+ "num_non_tie": 3463,
7
+ "accuracy": 0.7678313600924054,
8
+ "avg_confidence": 0.7792870429358729,
9
+ "std_confidence": 0.14544243560525433
10
+ },
11
+ "if": {
12
+ "num_non_tie": 3463,
13
+ "accuracy": 0.6996823563384349,
14
+ "avg_confidence": 0.7319046033279007,
15
+ "std_confidence": 0.1337721067466566
16
+ },
17
+ "avg_accuracy": 0.7337568582154201
18
+ },
19
+ "by_modality": {
20
+ "has_audio": {
21
+ "count": 884,
22
+ "mq_acc": 0.8122171945701357,
23
+ "if_acc": 0.7726244343891403,
24
+ "mq_conf": 0.7979760396534501,
25
+ "if_conf": 0.7548858234785262
26
+ },
27
+ "no_audio": {
28
+ "count": 2579,
29
+ "mq_acc": 0.7526172935246219,
30
+ "if_acc": 0.6746801085692129,
31
+ "mq_conf": 0.7728810432854897,
32
+ "if_conf": 0.7240273646256312
33
+ },
34
+ "has_lyrics": {
35
+ "count": 943,
36
+ "mq_acc": 0.8038176033934252,
37
+ "if_acc": 0.7592788971367974,
38
+ "mq_conf": 0.7926488271573695,
39
+ "if_conf": 0.732424895558605
40
+ },
41
+ "no_lyrics": {
42
+ "count": 2520,
43
+ "mq_acc": 0.7543650793650793,
44
+ "if_acc": 0.6773809523809524,
45
+ "mq_conf": 0.7742869784434636,
46
+ "if_conf": 0.7317099066717284
47
+ }
48
+ },
49
+ "by_score_diff": {
50
+ "score_range": {
51
+ "min":
20260125_1231/test_20260125_194533_reward_model.best_4499/test_config.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DEVICES: '7'
2
+ accelerate:
3
+ mixed_precision: bf16
4
+ basics:
5
+ random_seed: 42
6
+ save_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/reward_model
7
+ dataset:
8
+ audio_dropout:
9
+ apply_to_eval: false
10
+ apply_to_ref: true
11
+ enabled: false
12
+ eval_only_on_training: true
13
+ max_duration: 1500
14
+ min_duration: 1500
15
+ train_mode: start
16
+ cache_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/tmp
17
+ db_path: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/backend/database.db
18
+ duration: 600.0
19
+ embedding_dir: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/supervised_embeddings
20
+ mode: raw_text_frozen_audio
21
+ max_samples: null
22
+ max_val_samples: null
23
+ metadata_jsonl: ${project_root}/CMI-Training/all_comparisons.jsonl
24
+ preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/unbiased_qwen/train.json
25
+ sample_rate: 24000
26
+ val_preference_file: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/CMI-Training/human_annotations/train.json
27
+ loss:
28
+ IF_ratio: 0.5
29
+ filter_ties: true
30
+ label_smoothing: 0.0
31
+ reduction: mean
32
+ model:
33
+ attention_mode: SA
34
+ attn_dropout: 0.0
35
+ category_embeddings: null
36
+ dim: 768
37
+ dim_head: 64
38
+ downsample:
39
+ configs:
40
+ conv2_4x:
41
+ factor: 4
42
+ kernel_size: 5
43
+ kind: conv*2
44
+ use_layernorm: true
45
+ conv_4x:
46
+ factor: 4
47
+ kernel_size: 5
48
+ kind: conv
49
+ stage: 1
50
+ use_layernorm: true
51
+ glu_4x:
52
+ factor: 4
53
+ kernel_size: 5
54
+ kind: gluconv*2+pw
55
+ use_layernorm: true
56
+ mean:
57
+ factor: 2
58
+ kind: mean
59
+ mean_4x:
60
+ dropout: 0.0
61
+ factor: 30
62
+ kind: mean+mlp
63
+ mlp_ratio: 2.0
64
+ none:
65
+ factor: 1
66
+ kind: none
67
+ eval: mean_4x
68
+ ref: null
69
+ text: none
70
+ ff_dropout: 0.0
71
+ ff_mult: 4
72
+ freeze_audio: true
73
+ freeze_text: true
74
+ gradient_checkpointing: false
75
+ heads: 8
76
+ joint_tf_depth: 1
77
+ load_config:
78
+ checkpoint_path: null
79
+ frozen_from_pretrained: true
80
+ pretrained_name: OpenMuQ/MuQ-MuLan-large
81
+ strict: false
82
+ mlp_dim: 768
83
+ mode: concat_text_late
84
+ model_name: OpenMuQ/MuQ-MuLan-large
85
+ name: reward
86
+ null_embedding:
87
+ audio:
88
+ dropout: 0.5
89
+ length: 10
90
+ lyrics:
91
+ dropout: 0.3
92
+ length: 10
93
+ text:
94
+ dropout: 0
95
+ length: 10
96
+ output_dim: 2
97
+ prompt_tf_depth: 4
98
+ sr: 24000
99
+ text_encoder:
100
+ name: muq_mulan
101
+ tune: null
102
+ text_lora_config: null
103
+ train_muq_depth: 0
104
+ train_muqmulan: false
105
+ use_layer_idx: -1
106
+ use_audio: true
107
+ no_condition: false
108
+ project_root: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena
109
+ run_name: null
110
+ train:
111
+ batch_size: 48
112
+ betas:
113
+ - 0.9
114
+ - 0.99
115
+ ema_decay: 0.9999
116
+ ema_update_every: 1
117
+ enable_gradient_checkpointing: true
118
+ force_clear_prev_results: false
119
+ grad_accum_every: 1
120
+ log_tensorboard: true
121
+ lr_schedule:
122
+ min_lr_ratio: 0.001
123
+ name: linear_cosine
124
+ total_steps: 30000
125
+ warmup_steps: 300
126
+ max_grad_norm: 100
127
+ mlp_lr: 0.0001
128
+ num_train_steps: 30000
129
+ num_valid_batches: 10
130
+ num_workers: 8
131
+ other_lr: null
132
+ resume: /data/yrb/musicarena/Haiwen/offline_data/cmi-arena/experiments/finetune_human/20260125_1231/ckpt/reward_model.best_4499.pt
133
+ resume_optimizer: false
134
+ save_model_every: 2000
135
+ use_checkpoint_config: true
136
+ use_ema: true
137
+ use_lion: false
138
+ valid_batch_size: 20
139
+ valid_every: 2000
140
+ valid_frac: 0.1
141
+ verify_weights_on_load: true
142
+ validate_only: true
20260125_1231/test_20260125_194533_reward_model.best_4499/test_results.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metrics": {
3
+ "overall": {
4
+ "total_samples": 3463,
5
+ "mq": {
6
+ "num_non_tie": 3463,
7
+ "accuracy": 0.7678313600924054,
8
+ "avg_confidence": 0.7792870429358729,
9
+ "std_confidence": 0.14544243560525433
10
+ },
11
+ "if": {
12
+ "num_non_tie": 3463,
13
+ "accuracy": 0.6996823563384349,
14
+ "avg_confidence": 0.7319046033279007,
15
+ "std_confidence": 0.1337721067466566
16
+ },
17
+ "avg_accuracy": 0.7337568582154201
18
+ },
19
+ "by_modality": {
20
+ "has_audio": {
21
+ "count": 884,
22
+ "mq_acc": 0.8122171945701357,
23
+ "if_acc": 0.7726244343891403,
24
+ "mq_conf": 0.7979760396534501,
25
+ "if_conf": 0.7548858234785262
26
+ },
27
+ "no_audio": {
28
+ "count": 2579,
29
+ "mq_acc": 0.7526172935246219,
30
+ "if_acc": 0.6746801085692129,
31
+ "mq_conf": 0.7728810432854897,
32
+ "if_conf": 0.7240273646256312
33
+ },
34
+ "has_lyrics": {
35
+ "count": 943,
36
+ "mq_acc": 0.8038176033934252,
37
+ "if_acc": 0.7592788971367974,
38
+ "mq_conf": 0.7926488271573695,
39
+ "if_conf": 0.732424895558605
40
+ },
41
+ "no_lyrics": {
42
+ "count": 2520,
43
+ "mq_acc": 0.7543650793650793,
44
+ "if_acc": 0.6773809523809524,
45
+ "mq_conf": 0.7742869784434636,
46
+ "if_conf": 0.7317099066717284
47
+ }
48
+ },
49
+ "by_score_diff": {
50
+ "score_range": {
51
+ "min": 0.0,
52
+ "max": 4.0
53
+ },
54
+ "0-1": {
55
+ "count": 59,
56
+ "mq_acc": 0.6610169491525424,
57
+ "if_acc": 0.6271186440677966,
58
+ "mq_conf": 0.6943881915787519,
59
+ "if_conf": 0.6363525047140607
60
+ },
61
+ "1-2": {
62
+ "count": 367,
63
+ "mq_acc": 0.6512261580381471,
64
+ "if_acc": 0.5858310626702997,
65
+ "mq_conf": 0.7223088202099709,
66
+ "if_conf": 0.674628816443503
67
+ },
68
+ "2-3": {
69
+ "count": 1192,
70
+ "mq_acc": 0.7374161073825504,
71
+ "if_acc": 0.6459731543624161,
72
+ "mq_conf": 0.7660567649958918,
73
+ "if_conf": 0.7183731070800916
74
+ },
75
+ "3+": {
76
+ "count": 1845,
77
+ "mq_acc": 0.8140921409214092,
78
+ "if_acc": 0.759349593495935,
79
+ "mq_conf": 0.8018835368518261,
80
+ "if_conf": 0.7550955687111955
81
+ }
82
+ },
83
+ "by_duration": {
84
+ "0-30s": {
85
+ "count": 1097,
86
+ "mq_acc": 0.7529626253418414,
87
+ "if_acc": 0.6435733819507748,
88
+ "mq_conf": 0.7710004717301757,
89
+ "if_conf": 0.7230207648403338
90
+ },
91
+ "30-60s": {
92
+ "count": 1007,
93
+ "mq_acc": 0.7864945382323734,
94
+ "if_acc": 0.7149950347567031,
95
+ "mq_conf": 0.8106962935453376,
96
+ "if_conf": 0.7519949673422517
97
+ },
98
+ "60-90s": {
99
+ "count": 741,
100
+ "mq_acc": 0.7840755735492577,
101
+ "if_acc": 0.7651821862348178,
102
+ "mq_conf": 0.7830080420542986,
103
+ "if_conf": 0.7454769649164558
104
+ },
105
+ "90-120s": {
106
+ "count": 12,
107
+ "mq_acc": 0.6666666666666666,
108
+ "if_acc": 0.5,
109
+ "mq_conf": 0.7591231515010198,
110
+ "if_conf": 0.7100299447774887
111
+ },
112
+ "120s+": {
113
+ "count": 606,
114
+ "mq_acc": 0.7458745874587459,
115
+ "if_acc": 0.6996699669966997,
116
+ "mq_conf": 0.7379437419447569,
117
+ "if_conf": 0.6984391746544601
118
+ }
119
+ },
120
+ "model_pairs": {
121
+ "total_pairs": 128,
122
+ "valid_pairs": 45,
123
+ "min_count_threshold": 10,
124
+ "top_5": [
125
+ {
126
+ "pair": "jamify vs suno-v4.5-plus",
127
+ "count": 13,
128
+ "mq_acc": 1.0,
129
+ "if_acc": 0.9230769230769231,
130
+ "avg_acc": 0.9615384615384616,
131
+ "mq_conf": 0.8753359088530908,
132
+ "if_conf": 0.8881901227510892
133
+ },
134
+ {
135
+ "pair": "jamify vs suno-v4",
136
+ "count": 11,
137
+ "mq_acc": 1.0,
138
+ "if_acc": 0.9090909090909091,
139
+ "avg_acc": 0.9545454545454546,
140
+ "mq_conf": 0.8784825205802917,
141
+ "if_conf": 0.8930827325040643
142
+ },
143
+ {
144
+ "pair": "audioldm2-music vs magenta-rt-large",
145
+ "count": 116,
146
+ "mq_acc": 0.9224137931034483,
147
+ "if_acc": 0.9137931034482759,
148
+ "avg_acc": 0.9181034482758621,
149
+ "mq_conf": 0.8762899652637285,
150
+ "if_conf": 0.8524722950211887
151
+ },
152
+ {
153
+ "pair": "jamify vs levo",
154
+ "count": 65,
155
+ "mq_acc": 0.9538461538461539,
156
+ "if_acc": 0.8769230769230769,
157
+ "avg_acc": 0.9153846153846155,
158
+ "mq_conf": 0.8298323347018315,
159
+ "if_conf": 0.7724327931037316
160
+ },
161
+ {
162
+ "pair": "jamify vs suno-v3.5",
163
+ "count": 27,
164
+ "mq_acc": 0.9629629629629629,
165
+ "if_acc": 0.8518518518518519,
166
+ "avg_acc": 0.9074074074074074,
167
+ "mq_conf": 0.8570071635422883,
168
+ "if_conf": 0.8533604873551263
169
+ }
170
+ ],
171
+ "bottom_5": [
172
+ {
173
+ "pair": "audioldm vs sao",
174
+ "count": 12,
175
+ "mq_acc": 0.5,
176
+ "if_acc": 0.5,
177
+ "avg_acc": 0.5,
178
+ "mq_conf": 0.6995708495378494,
179
+ "if_conf": 0.6728040178616842
180
+ },
181
+ {
182
+ "pair": "audioldm2-music vs sao-small",
183
+ "count": 20,
184
+ "mq_acc": 0.6,
185
+ "if_acc": 0.45,
186
+ "avg_acc": 0.525,
187
+ "mq_conf": 0.7222894936800003,
188
+ "if_conf": 0.6842079430818557
189
+ },
190
+ {
191
+ "pair": "sao vs sao-small",
192
+ "count": 18,
193
+ "mq_acc": 0.5555555555555556,
194
+ "if_acc": 0.5555555555555556,
195
+ "avg_acc": 0.5555555555555556,
196
+ "mq_conf": 0.7228857609960768,
197
+ "if_conf": 0.680361701382531
198
+ },
199
+ {
200
+ "pair": "suno-v3.5 vs suno-v5",
201
+ "count": 10,
202
+ "mq_acc": 0.7,
203
+ "if_acc": 0.5,
204
+ "avg_acc": 0.6,
205
+ "mq_conf": 0.6346197962760926,
206
+ "if_conf": 0.6014198660850525
207
+ },
208
+ {
209
+ "pair": "magenta-rt-large vs sao-small",
210
+ "count": 16,
211
+ "mq_acc": 0.6875,
212
+ "if_acc": 0.5625,
213
+ "avg_acc": 0.625,
214
+ "mq_conf": 0.8538035452365875,
215
+ "if_conf": 0.8277972266077995
216
+ }
217
+ ]
218
+ },
219
+ "alignment": {
220
+ "total_non_tie": 3463,
221
+ "agreement_rate": 0.9347386658966215,
222
+ "agree": {
223
+ "count": 3237,
224
+ "mq_acc": 0.788693234476367,
225
+ "if_acc": 0.7055915971578622,
226
+ "mq_conf": 0.7936204567454942,
227
+ "if_conf": 0.7445049513111816
228
+ },
229
+ "disagree": {
230
+ "count": 226,
231
+ "mq_acc": 0.4690265486725664,
232
+ "if_acc": 0.6150442477876106,
233
+ "mq_conf": 0.5739894300962971,
234
+ "if_conf": 0.5514297076558645
235
+ }
236
+ }
237
+ },
238
+ "summary": "======================================================================\nTEST METRICS SUMMARY\n======================================================================\n\n[Overall] Total: 3463 samples\n MQ: Acc=0.7678, Conf=0.7793 ± 0.1454 (n=3463)\n IF: Acc=0.6997, Conf=0.7319 ± 0.1338 (n=3463)\n Avg Acc: 0.7338\n\n[By Prompt Modality]\n has_audio : n= 884, MQ_acc=0.8122, IF_acc=0.7726, MQ_conf=0.7980, IF_conf=0.7549\n no_audio : n= 2579, MQ_acc=0.7526, IF_acc=0.6747, MQ_conf=0.7729, IF_conf=0.7240\n has_lyrics : n= 943, MQ_acc=0.8038, IF_acc=0.7593, MQ_conf=0.7926, IF_conf=0.7324\n no_lyrics : n= 2520, MQ_acc=0.7544, IF_acc=0.6774, MQ_conf=0.7743, IF_conf=0.7317\n\n[By Score Difference (data confidence bins)]\n Score range: [0.00, 4.00]\n 0-1 : n= 59, MQ_acc=0.6610, IF_acc=0.6271, MQ_conf=0.6944, IF_conf=0.6364\n 1-2 : n= 367, MQ_acc=0.6512, IF_acc=0.5858, MQ_conf=0.7223, IF_conf=0.6746\n 2-3 : n= 1192, MQ_acc=0.7374, IF_acc=0.6460, MQ_conf=0.7661, IF_conf=0.7184\n 3+ : n= 1845, MQ_acc=0.8141, IF_acc=0.7593, MQ_conf=0.8019, IF_conf=0.7551\n\n[By Duration (gen_a + gen_b)]\n 0-30s : n= 1097, MQ_acc=0.7530, IF_acc=0.6436, MQ_conf=0.7710, IF_conf=0.7230\n 30-60s : n= 1007, MQ_acc=0.7865, IF_acc=0.7150, MQ_conf=0.8107, IF_conf=0.7520\n 60-90s : n= 741, MQ_acc=0.7841, IF_acc=0.7652, MQ_conf=0.7830, IF_conf=0.7455\n 90-120s : n= 12, MQ_acc=0.6667, IF_acc=0.5000, MQ_conf=0.7591, IF_conf=0.7100\n 120s+ : n= 606, MQ_acc=0.7459, IF_acc=0.6997, MQ_conf=0.7379, IF_conf=0.6984\n\n[Model Pairs] Total: 128 unique pairs\n Top 5 (by avg acc):\n jamify vs suno-v4.5-plus : n= 13, MQ=1.0000, IF=0.9231, Avg=0.9615, Conf(MQ/IF)=0.875/0.888\n jamify vs suno-v4 : n= 11, MQ=1.0000, IF=0.9091, Avg=0.9545, Conf(MQ/IF)=0.878/0.893\n audioldm2-music vs magenta-rt-large : n= 116, MQ=0.9224, IF=0.9138, Avg=0.9181, Conf(MQ/IF)=0.876/0.852\n jamify vs levo : n= 65, MQ=0.9538, IF=0.8769, Avg=0.9154, Conf(MQ/IF)=0.830/0.772\n jamify vs suno-v3.5 : n= 27, MQ=0.9630, IF=0.8519, Avg=0.9074, 
Conf(MQ/IF)=0.857/0.853\n Bottom 5:\n audioldm vs sao : n= 12, MQ=0.5000, IF=0.5000, Avg=0.5000, Conf(MQ/IF)=0.700/0.673\n audioldm2-music vs sao-small : n= 20, MQ=0.6000, IF=0.4500, Avg=0.5250, Conf(MQ/IF)=0.722/0.684\n sao vs sao-small : n= 18, MQ=0.5556, IF=0.5556, Avg=0.5556, Conf(MQ/IF)=0.723/0.680\n suno-v3.5 vs suno-v5 : n= 10, MQ=0.7000, IF=0.5000, Avg=0.6000, Conf(MQ/IF)=0.635/0.601\n magenta-rt-large vs sao-small : n= 16, MQ=0.6875, IF=0.5625, Avg=0.6250, Conf(MQ/IF)=0.854/0.828\n\n[MQ vs IF Alignment]\n Agreement rate: 0.9347 (3463 non-tie samples)\n When agree (n= 3237): MQ_acc=0.7887, IF_acc=0.7056, MQ_conf=0.7936, IF_conf=0.7445\n When disagree(n= 226): MQ_acc=0.4690, IF_acc=0.6150, MQ_conf=0.5740, IF_conf=0.5514\n======================================================================"
239
+ }