diff --git "a/lm_eval_logs/driver_nohup.log" "b/lm_eval_logs/driver_nohup.log"
new file mode 100644--- /dev/null
+++ "b/lm_eval_logs/driver_nohup.log"
@@ -0,0 +1,1213 @@
+[INFO] override: GPU_ID=1
+[INFO] override: RUN_BG=1
+[INFO] override: MUON_LR=2.6e-3
+[INFO] override: ALPHA=1200
+[INFO] override: RUN_TRAIN=1
+[INFO] override: STEERING_COEFFS=15
+[INFO] OUTPUT_DIR      = models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6
+[INFO] PRETRAINED_PATH = /egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6
+[INFO] LOG_DIR         = /egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/lm_eval_logs
+[INFO] GPU_ID          = 1 (CUDA_VISIBLE_DEVICES=1 => device cuda:0)
+[INFO] WAIT_SEC        = 300 (seconds)
+[INFO] MAX_RETRIES     = 10 (0 means infinite)
+[INFO] training...
+/egr/research-optml/wangc168/anaconda3/envs/py310/lib/python3.10/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]Downloading shards: 100%|██████████| 8/8 [00:00<00:00, 2715.42it/s]
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:02,  2.59it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:00<00:02,  2.66it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:01<00:01,  2.55it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:01<00:01,  2.57it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:01<00:01,  2.51it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:02<00:00,  2.50it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:02<00:00,  2.48it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:03<00:00,  2.82it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:03<00:00,  2.64it/s]
+Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]Downloading shards: 100%|██████████| 8/8 [00:00<00:00, 3622.02it/s]
+Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]Loading checkpoint shards:  12%|█▎        | 1/8 [00:00<00:02,  2.70it/s]Loading checkpoint shards:  25%|██▌       | 2/8 [00:00<00:02,  2.74it/s]Loading checkpoint shards:  38%|███▊      | 3/8 [00:01<00:01,  2.71it/s]Loading checkpoint shards:  50%|█████     | 4/8 [00:01<00:01,  2.66it/s]Loading checkpoint shards:  62%|██████▎   | 5/8 [00:01<00:01,  2.55it/s]Loading checkpoint shards:  75%|███████▌  | 6/8 [00:02<00:00,  2.53it/s]Loading checkpoint shards:  88%|████████▊ | 7/8 [00:02<00:00,  2.50it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:02<00:00,  2.92it/s]Loading checkpoint shards: 100%|██████████| 8/8 [00:02<00:00,  2.72it/s]
+wandb: Currently logged in as: wangc168 (wcs23187) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
+wandb: setting up run kstsc84q
+wandb: Tracking run with wandb version 0.23.1
+wandb: Run data is saved locally in /egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/wandb/run-20260403_161129-kstsc84q
+wandb: Run `wandb offline` to turn off syncing.
+wandb: Syncing run muon-v1-rank-models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6
+wandb: ⭐️ View project at https://wandb.ai/wcs23187/Muon-constraint
+wandb: 🚀 View run at https://wandb.ai/wcs23187/Muon-constraint/runs/kstsc84q
+====rmu Config====
+model_name_or_path=HuggingFaceH4/zephyr-7b-beta
+module_str={model_name}.model.layers[{layer_id}]
+output_dir=models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6
+retain_corpora=['wikitext', 'wikitext']
+forget_corpora=['bio_remove_dataset', 'cyber-forget-corpus']
+alpha=[1200.0, 1200.0]
+steering_coeffs=15,15
+muon_lr=0.0026
+muon_momentum=0.95
+muon_weight_decay=0.1
+muon_ns_steps=5
+muon_nesterov=True
+adam_emb_lr=5e-05
+adam_scalar_lr=5e-05
+enable_muon_svd_logging=False
+M_stats_path=
+M_check_T=0
+V1_K=64
+min_len=0
+max_len=2000
+batch_size=4
+max_num_batches=150
+layer_id=7
+layer_ids=[5, 6, 7]
+param_ids=[6]
+seed=42
+verbose=False
+steering_coeff_list=[15.0, 15.0]
+=====
+MUON model.layers.5.mlp.down_proj.weight (4096, 14336)
+MUON model.layers.6.mlp.down_proj.weight (4096, 14336)
+MUON model.layers.7.mlp.down_proj.weight (4096, 14336)
+======= Epoch 0 =======
+  0%|          | 0/150 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
+/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/rmu_wb/unlearn_muon_v1.py:201: UserWarning: Using a target size (torch.Size([1, 1, 4096])) that is different to the input size (torch.Size([4, 512, 4096])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
+  unlearn_loss = torch.nn.functional.mse_loss(
+  1%|          | 1/150 [00:04<12:20,  4.97s/it]/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/rmu_wb/unlearn_muon_v1.py:201: UserWarning: Using a target size (torch.Size([1, 1, 4096])) that is different to the input size (torch.Size([4, 768, 4096])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
+  unlearn_loss = torch.nn.functional.mse_loss(
+  1%|▏         | 2/150 [00:13<16:51,  6.84s/it]  2%|▏         | 3/150 [00:21<18:04,  7.38s/it]  3%|▎         | 4/150 [00:29<18:34,  7.63s/it]  3%|▎         | 5/150 [00:37<18:45,  7.76s/it]  4%|▍         | 6/150 [00:45<18:48,  7.84s/it]  5%|▍         | 7/150 [00:53<18:48,  7.89s/it]  5%|▌         | 8/150 [01:01<18:43,  7.91s/it]  6%|▌         | 9/150 [01:09<18:40,  7.94s/it]  7%|▋         | 10/150 [01:17<18:32,  7.95s/it]  7%|▋         | 11/150 [01:25<18:28,  7.97s/it]  8%|▊         | 12/150 [01:33<18:21,  7.98s/it]  9%|▊         | 13/150 [01:41<18:15,  8.00s/it]  9%|▉         | 14/150 [01:49<18:10,  8.02s/it] 10%|█         | 15/150 [01:57<17:57,  7.98s/it] 11%|█         | 16/150 [02:04<17:46,  7.96s/it] 11%|█▏        | 17/150 [02:12<17:36,  7.95s/it] 12%|█▏        | 18/150 [02:20<17:29,  7.95s/it] 13%|█▎        | 19/150 [02:28<17:19,  7.94s/it] 13%|█▎        | 20/150 [02:36<17:10,  7.93s/it] 14%|█▍        | 21/150 [02:44<17:02,  7.92s/it] 15%|█▍        | 22/150 [02:52<16:55,  7.94s/it] 15%|█▌        | 23/150 [03:00<16:47,  7.93s/it] 16%|█▌        | 24/150 [03:08<16:37,  7.92s/it] 17%|█▋        | 25/150 [03:16<16:33,  7.94s/it] 17%|█▋        | 26/150 [03:24<16:24,  7.94s/it] 18%|█▊        | 27/150 [03:32<16:17,  7.94s/it] 19%|█▊        | 28/150 [03:40<16:09,  7.95s/it] 19%|█▉        | 29/150 [03:48<15:59,  7.93s/it] 20%|██        | 30/150 [03:56<15:50,  7.92s/it] 21%|██        | 31/150 [04:04<15:45,  7.95s/it] 21%|██▏       | 32/150 [04:11<15:37,  7.94s/it] 22%|██▏       | 33/150 [04:19<15:29,  7.94s/it] 23%|██▎       | 34/150 [04:27<15:21,  7.95s/it] 23%|██▎       | 35/150 [04:35<15:17,  7.98s/it] 24%|██▍       | 36/150 [04:43<15:07,  7.96s/it] 25%|██▍       | 37/150 [04:51<15:00,  7.97s/it] 25%|██▌       | 38/150 [04:59<14:53,  7.98s/it] 26%|██▌       | 39/150 [05:07<14:41,  7.94s/it] 27%|██▋       | 40/150 [05:15<14:33,  7.94s/it] 27%|██▋       | 41/150 [05:23<14:26,  7.95s/it] 28%|██▊       | 42/150 [05:31<14:19,  7.96s/it] 29%|██▊       | 43/150 [05:39<14:10,  7.95s/it] 29%|██▉       | 44/150 [05:47<14:02,  7.95s/it] 30%|███       | 45/150 [05:55<13:54,  7.95s/it] 31%|███       | 46/150 [06:03<13:46,  7.95s/it] 31%|███▏      | 47/150 [06:11<13:46,  8.02s/it] 32%|███▏      | 48/150 [06:19<13:33,  7.97s/it] 33%|███▎      | 49/150 [06:27<13:22,  7.95s/it] 33%|███▎      | 50/150 [06:35<13:15,  7.96s/it] 34%|███▍      | 51/150 [06:43<13:05,  7.94s/it] 35%|███▍      | 52/150 [06:51<12:56,  7.93s/it] 35%|███▌      | 53/150 [06:58<12:49,  7.93s/it] 36%|███▌      | 54/150 [07:06<12:41,  7.93s/it] 37%|███▋      | 55/150 [07:14<12:33,  7.94s/it] 37%|███▋      | 56/150 [07:22<12:26,  7.94s/it] 38%|███▊      | 57/150 [07:30<12:18,  7.95s/it] 39%|███▊      | 58/150 [07:38<12:12,  7.96s/it] 39%|███▉      | 59/150 [07:46<12:05,  7.97s/it] 40%|████      | 60/150 [07:54<11:59,  7.99s/it] 41%|████      | 61/150 [08:02<11:52,  8.00s/it]now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1377 | unlearn_loss: 0.1377 | retain_loss: 0 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.0791 | retain_loss: 4.816e-05 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1426 | unlearn_loss: 0.1416 | retain_loss: 0.001022 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07959 | unlearn_loss: 0.0791 | retain_loss: 0.0006828 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1387 | unlearn_loss: 0.1377 | retain_loss: 0.0008354 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07959 | unlearn_loss: 0.07861 | retain_loss: 0.0009842 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1357 | unlearn_loss: 0.1357 | retain_loss: 0.0004139 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07959 | unlearn_loss: 0.07861 | retain_loss: 0.0009918 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1416 | unlearn_loss: 0.1406 | retain_loss: 0.0009003 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.07861 | retain_loss: 0.0004692 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1377 | unlearn_loss: 0.1367 | retain_loss: 0.001373 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.07861 | retain_loss: 0.0004501 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1377 | unlearn_loss: 0.1367 | retain_loss: 0.0005684 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.07861 | retain_loss: 0.0007057 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1436 | unlearn_loss: 0.1396 | retain_loss: 0.003998 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08008 | unlearn_loss: 0.07861 | retain_loss: 0.001457 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1377 | unlearn_loss: 0.1367 | retain_loss: 0.001244 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08008 | unlearn_loss: 0.07861 | retain_loss: 0.001556 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1396 | unlearn_loss: 0.1387 | retain_loss: 0.001022 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.07812 | retain_loss: 0.0009079 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1406 | unlearn_loss: 0.1377 | retain_loss: 0.003036 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07959 | unlearn_loss: 0.07812 | retain_loss: 0.001228 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1406 | unlearn_loss: 0.1387 | retain_loss: 0.001526 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.07812 | retain_loss: 0.0009193 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1406 | unlearn_loss: 0.1396 | retain_loss: 0.001152 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.07812 | retain_loss: 0.001007 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1387 | unlearn_loss: 0.1357 | retain_loss: 0.003036 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08008 | unlearn_loss: 0.07812 | retain_loss: 0.001801 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1396 | unlearn_loss: 0.1367 | retain_loss: 0.002579 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.07764 | retain_loss: 0.001518 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1377 | unlearn_loss: 0.1367 | retain_loss: 0.001404 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07861 | unlearn_loss: 0.07764 | retain_loss: 0.001099 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1387 | unlearn_loss: 0.1357 | retain_loss: 0.002899 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08008 | unlearn_loss: 0.07812 | retain_loss: 0.002121 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1406 | unlearn_loss: 0.1387 | retain_loss: 0.001907 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07959 | unlearn_loss: 0.07812 | retain_loss: 0.001358 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1416 | unlearn_loss: 0.1367 | retain_loss: 0.004425 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08057 | unlearn_loss: 0.07812 | retain_loss: 0.002625 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1426 | unlearn_loss: 0.1396 | retain_loss: 0.002457 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07959 | unlearn_loss: 0.07764 | retain_loss: 0.001823 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1416 | unlearn_loss: 0.1367 | retain_loss: 0.004456 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08008 | unlearn_loss: 0.07715 | retain_loss: 0.002975 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1455 | unlearn_loss: 0.1416 | retain_loss: 0.003616 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07959 | unlearn_loss: 0.07715 | retain_loss: 0.002594 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1416 | unlearn_loss: 0.1387 | retain_loss: 0.002579 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.07715 | retain_loss: 0.002182 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1445 | unlearn_loss: 0.1396 | retain_loss: 0.004578 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08057 | unlearn_loss: 0.07764 | retain_loss: 0.002914 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1416 | unlearn_loss: 0.1377 | retain_loss: 0.00351 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07959 | unlearn_loss: 0.07715 | retain_loss: 0.002396 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1396 | unlearn_loss: 0.1367 | retain_loss: 0.003021 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07861 | unlearn_loss: 0.07666 | retain_loss: 0.001945 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1377 | unlearn_loss: 0.1348 | retain_loss: 0.002563 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0791 | unlearn_loss: 0.07715 | retain_loss: 0.001762 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1416 | unlearn_loss: 0.1338 | retain_loss: 0.00824 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08252 | unlearn_loss: 0.07617 | retain_loss: 0.006256 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1279 | unlearn_loss: 0.1226 | retain_loss: 0.005066 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08057 | unlearn_loss: 0.07617 | retain_loss: 0.004272 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1406 | unlearn_loss: 0.1309 | retain_loss: 0.009583 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08447 | unlearn_loss: 0.07617 | retain_loss: 0.008179 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1396 | unlearn_loss: 0.1309 | retain_loss: 0.008667 | param_change: 0
+now we use  the mode ..  v1 41%|████▏     | 62/150 [08:10<11:45,  8.02s/it] 42%|████▏     | 63/150 [08:18<11:38,  8.03s/it] 43%|████▎     | 64/150 [08:26<11:31,  8.05s/it] 43%|████▎     | 65/150 [08:34<11:21,  8.01s/it] 44%|████▍     | 66/150 [08:42<11:12,  8.01s/it] 45%|████▍     | 67/150 [08:50<11:03,  7.99s/it] 45%|████▌     | 68/150 [08:58<10:54,  7.98s/it] 46%|████▌     | 69/150 [09:06<10:46,  7.98s/it] 47%|████▋     | 70/150 [09:14<10:37,  7.97s/it] 47%|████▋     | 71/150 [09:22<10:29,  7.96s/it] 48%|████▊     | 72/150 [09:30<10:21,  7.96s/it] 49%|████▊     | 73/150 [09:38<10:14,  7.98s/it] 49%|████▉     | 74/150 [09:46<10:04,  7.95s/it] 50%|█████     | 75/150 [09:54<09:59,  7.99s/it] 51%|█████     | 76/150 [10:02<09:51,  7.99s/it] 51%|█████▏    | 77/150 [10:10<09:41,  7.97s/it] 52%|█████▏    | 78/150 [10:18<09:32,  7.95s/it] 53%|█████▎    | 79/150 [10:26<09:24,  7.95s/it]/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/rmu_wb/unlearn_muon_v1.py:201: UserWarning: Using a target size (torch.Size([1, 1, 4096])) that is different to the input size (torch.Size([4, 679, 4096])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
+  unlearn_loss = torch.nn.functional.mse_loss(
+ 53%|█████▎    | 80/150 [10:34<09:15,  7.94s/it] 54%|█████▍    | 81/150 [10:42<09:09,  7.97s/it] 55%|█████▍    | 82/150 [10:50<09:01,  7.97s/it] 55%|█████▌    | 83/150 [10:58<08:53,  7.96s/it] 56%|█████▌    | 84/150 [11:06<08:44,  7.95s/it] 57%|█████▋    | 85/150 [11:14<08:37,  7.96s/it] 57%|█████▋    | 86/150 [11:22<08:29,  7.96s/it] 58%|█████▊    | 87/150 [11:30<08:20,  7.95s/it] 59%|█████▊    | 88/150 [11:38<08:12,  7.94s/it] 59%|█████▉    | 89/150 [11:45<08:04,  7.94s/it] 60%|██████    | 90/150 [11:53<07:56,  7.94s/it] 61%|██████    | 91/150 [12:01<07:48,  7.94s/it] 61%|██████▏   | 92/150 [12:09<07:40,  7.95s/it] 62%|██████▏   | 93/150 [12:17<07:32,  7.93s/it] 63%|██████▎   | 94/150 [12:25<07:23,  7.92s/it] 63%|██████▎   | 95/150 [12:33<07:17,  7.95s/it] 64%|██████▍   | 96/150 [12:41<07:09,  7.95s/it] 65%|██████▍   | 97/150 [12:49<07:02,  7.97s/it] 65%|██████▌   | 98/150 [12:57<06:54,  7.98s/it] 66%|██████▌   | 99/150 [13:05<06:45,  7.96s/it] 67%|██████▋   | 100/150 [13:13<06:38,  7.98s/it] 67%|██████���   | 101/150 [13:21<06:30,  7.96s/it] 68%|██████▊   | 102/150 [13:29<06:22,  7.97s/it] 69%|██████▊   | 103/150 [13:37<06:14,  7.97s/it] 69%|██████▉   | 104/150 [13:45<06:06,  7.96s/it] 70%|███████   | 105/150 [13:53<05:58,  7.97s/it] 71%|███████   | 106/150 [14:01<05:51,  7.99s/it] 71%|███████▏  | 107/150 [14:09<05:43,  7.98s/it] 72%|███████▏  | 108/150 [14:17<05:35,  8.00s/it] 73%|███████▎  | 109/150 [14:25<05:26,  7.97s/it] 73%|███████▎  | 110/150 [14:33<05:23,  8.08s/it] 74%|███████▍  | 111/150 [14:41<05:12,  8.02s/it] 75%|███████▍  | 112/150 [14:49<05:04,  8.00s/it] 75%|███████▌  | 113/150 [14:57<04:55,  7.99s/it] 76%|███████▌  | 114/150 [15:05<04:47,  7.98s/it] 77%|███████▋  | 115/150 [15:13<04:38,  7.97s/it] 77%|███████▋  | 116/150 [15:21<04:31,  7.98s/it] 78%|███████▊  | 117/150 [15:29<04:23,  7.99s/it] 79%|███████▊  | 118/150 [15:37<04:16,  8.00s/it] 79%|███████▉  | 119/150 [15:45<04:07,  7.99s/it] 80%|████████  | 120/150 [15:53<03:59,  7.98s/it] 81%|████████  | 121/150 [16:01<03:51,  7.98s/it] 81%|████████▏ | 122/150 [16:09<03:43,  7.98s/it]
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08398 | unlearn_loss: 0.07666 | retain_loss: 0.007538 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1416 | unlearn_loss: 0.1328 | retain_loss: 0.008362 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08301 | unlearn_loss: 0.07568 | retain_loss: 0.007507 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1426 | unlearn_loss: 0.1377 | retain_loss: 0.004974 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07959 | unlearn_loss: 0.0752 | retain_loss: 0.004547 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1367 | unlearn_loss: 0.1318 | retain_loss: 0.004517 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07812 | unlearn_loss: 0.07422 | retain_loss: 0.00386 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1318 | unlearn_loss: 0.127 | retain_loss: 0.004547 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07861 | unlearn_loss: 0.07471 | retain_loss: 0.003769 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1309 | unlearn_loss: 0.127 | retain_loss: 0.00415 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07764 | unlearn_loss: 0.07373 | retain_loss: 0.003708 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1289 | unlearn_loss: 0.125 | retain_loss: 0.004395 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07666 | unlearn_loss: 0.07275 | retain_loss: 0.003891 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1357 | unlearn_loss: 0.1279 | retain_loss: 0.007477 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07715 | unlearn_loss: 0.07178 | retain_loss: 0.005463 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1318 | unlearn_loss: 0.126 | retain_loss: 0.005585 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07568 | unlearn_loss: 0.07129 | retain_loss: 0.004608 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1289 | unlearn_loss: 0.1245 | retain_loss: 0.004852 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07764 | unlearn_loss: 0.07324 | retain_loss: 0.004211 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.124 | unlearn_loss: 0.1196 | retain_loss: 0.004608 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07471 | unlearn_loss: 0.07031 | retain_loss: 0.004242 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.126 | unlearn_loss: 0.1196 | retain_loss: 0.006012 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07422 | unlearn_loss: 0.06934 | retain_loss: 0.004913 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1309 | unlearn_loss: 0.127 | retain_loss: 0.004333 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07275 | unlearn_loss: 0.06885 | retain_loss: 0.003906 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1216 | unlearn_loss: 0.1177 | retain_loss: 0.003967 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07129 | unlearn_loss: 0.06787 | retain_loss: 0.003448 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1245 | unlearn_loss: 0.1211 | retain_loss: 0.003632 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07129 | unlearn_loss: 0.06787 | retain_loss: 0.003189 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1162 | unlearn_loss: 0.1128 | retain_loss: 0.003662 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07178 | unlearn_loss: 0.06885 | retain_loss: 0.003113 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.125 | unlearn_loss: 0.1191 | retain_loss: 0.006256 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.0708 | unlearn_loss: 0.06592 | retain_loss: 0.004822 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1172 | unlearn_loss: 0.1113 | retain_loss: 0.005798 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07324 | unlearn_loss: 0.06836 | retain_loss: 0.004913 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.126 | unlearn_loss: 0.1206 | retain_loss: 0.005646 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.07178 | unlearn_loss: 0.06689 | retain_loss: 0.004944 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.124 | unlearn_loss: 0.1196 | retain_loss: 0.004517 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.06787 | unlearn_loss: 0.06396 | retain_loss: 0.00386 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1191 | unlearn_loss: 0.1128 | retain_loss: 0.006439 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.06982 | unlearn_loss: 0.06445 | retain_loss: 0.005249 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1167 | unlearn_loss: 0.1123 | retain_loss: 0.004242 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.06592 | unlearn_loss: 0.0625 | retain_loss: 0.003296 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1104 | unlearn_loss: 0.1055 | retain_loss: 0.0047 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.06689 | unlearn_loss: 0.06299 | retain_loss: 0.003937 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.106 | unlearn_loss: 0.1021 | retain_loss: 0.004059 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.06177 | unlearn_loss: 0.05811 | retain_loss: 0.003555 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1157 | unlearn_loss: 0.1113 | retain_loss: 0.004517 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.06079 | unlearn_loss: 0.05688 | retain_loss: 0.00383 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.105 | unlearn_loss: 0.1011 | retain_loss: 0.00412 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.06055 | unlearn_loss: 0.05688 | retain_loss: 0.00354 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1094 | unlearn_loss: 0.105 | retain_loss: 0.004242 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.06006 | unlearn_loss: 0.05615 | retain_loss: 0.003815 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.104 | unlearn_loss: 0.1001 | retain_loss: 0.003906 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05859 | unlearn_loss: 0.05518 | retain_loss: 0.003357 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1128 | unlearn_loss: 0.1084 | retain_loss: 0.004456 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05664 | unlearn_loss: 0.05298 | retain_loss: 0.003769 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1123 | unlearn_loss: 0.1084 | retain_loss: 0.003708 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05615 | unlearn_loss: 0.05298 | retain_loss: 0.003235 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.124 | unlearn_loss: 0.1191 | retain_loss: 0.004852 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05591 | unlearn_loss: 0.05176 | retain_loss: 0.004211 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+ 82%|████████▏ | 123/150 [16:17<03:35,  7.97s/it] 83%|████████▎ | 124/150 [16:25<03:27,  7.99s/it] 83%|████████▎ | 125/150 [16:33<03:19,  7.99s/it] 84%|████████▍ | 126/150 [16:41<03:11,  7.98s/it] 85%|████████▍ | 127/150 [16:49<03:03,  7.96s/it] 85%|████████▌ | 128/150 [16:57<02:55,  7.98s/it] 86%|████████▌ | 129/150 [17:04<02:47,  7.96s/it] 87%|████████▋ | 130/150 [17:12<02:39,  7.96s/it] 87%|████████▋ | 131/150 [17:20<02:31,  7.95s/it] 88%|████████▊ | 132/150 [17:28<02:23,  7.94s/it] 89%|████████▊ | 133/150 [17:36<02:14,  7.93s/it] 89%|████████▉ | 134/150 [17:44<02:07,  7.95s/it] 90%|█████████ | 135/150 [17:52<01:59,  7.95s/it] 91%|█████████ | 136/150 [18:00<01:51,  8.00s/it] 91%|█████████▏| 137/150 [18:08<01:44,  8.01s/it] 92%|█████████▏| 138/150 [18:17<01:37,  8.15s/it] 93%|█████████▎| 139/150 [18:25<01:29,  8.11s/it] 93%|█████████▎| 140/150 [18:33<01:20,  8.07s/it] 94%|█████████▍| 141/150 [18:41<01:12,  8.03s/it] 95%|█████████▍| 142/150 [18:49<01:04,  8.01s/it] 95%|█████████▌| 143/150 [18:57<00:55,  7.99s/it] 96%|█████████▌| 144/150 [19:05<00:47,  7.97s/it] 97%|█████████▋| 145/150 [19:12<00:39,  7.95s/it] 97%|█████████▋| 146/150 [19:20<00:31,  7.95s/it] 98%|█████████▊| 147/150 [19:28<00:23,  7.96s/it] 99%|█████████▊| 148/150 [19:36<00:15,  7.96s/it] 99%|█████████▉| 149/150 [19:44<00:07,  7.94s/it]100%|██████████| 150/150 [19:52<00:00,  7.94s/it]100%|██████████| 150/150 [19:52<00:00,  7.95s/it]
+wandb: updating run metadata
+wandb: uploading config.yaml
+wandb: 
+wandb: Run history:
+wandb:         loss ▄▄▄▇▄██▄▄▄▄▃▄██▄█▄▄█▇▃▇▃▆▃▆▇▆▃▂▅▆▂▂▁▂▅▁▁
+wandb: param_change ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
+wandb:  retain_loss ▁▁▁▁▁▂▁▂▂▂▂▂▂▁▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂█
+wandb:         step ▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
+wandb: unlearn_loss ▃█▃█▃▃██▃█▃▃▃█▃██▃▃▇▇▆▃▃▃▆▃▅▅▂▂▅▅▆▆▄▁▅▄▄
+wandb: 
+wandb: Run summary:
+wandb:         loss 0.06348
+wandb: param_change 0
+wandb:  retain_loss 0.0249
+wandb:         step 149
+wandb: unlearn_loss 0.03882
+wandb: 
+wandb: 🚀 View run muon-v1-rank-models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6 at: https://wandb.ai/wcs23187/Muon-constraint/runs/kstsc84q
+wandb: ⭐️ View project at: https://wandb.ai/wcs23187/Muon-constraint
+wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
+wandb: Find logs at: ./wandb/run-20260403_161129-kstsc84q/logs
+loss: 0.103 | unlearn_loss: 0.09912 | retain_loss: 0.003784 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05322 | unlearn_loss: 0.05005 | retain_loss: 0.003296 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1099 | unlearn_loss: 0.1055 | retain_loss: 0.004242 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05298 | unlearn_loss: 0.04956 | retain_loss: 0.003494 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1064 | unlearn_loss: 0.1025 | retain_loss: 0.003967 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05298 | unlearn_loss: 0.04956 | retain_loss: 0.003494 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.09619 | unlearn_loss: 0.0918 | retain_loss: 0.004456 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05127 | unlearn_loss: 0.04736 | retain_loss: 0.003891 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1006 | unlearn_loss: 0.09619 | retain_loss: 0.004608 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05029 | unlearn_loss: 0.04663 | retain_loss: 0.003723 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.09863 | unlearn_loss: 0.09473 | retain_loss: 0.003815 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.05273 | unlearn_loss: 0.04956 | retain_loss: 0.00325 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.09961 | unlearn_loss: 0.09619 | retain_loss: 0.003571 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.04468 | unlearn_loss: 0.04175 | retain_loss: 0.002975 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.09668 | unlearn_loss: 0.09229 | retain_loss: 0.004425 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.04419 | unlearn_loss: 0.04053 | retain_loss: 0.003708 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1025 | unlearn_loss: 0.09863 | retain_loss: 0.003754 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.04395 | unlearn_loss: 0.04077 | retain_loss: 0.003204 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.08496 | unlearn_loss: 0.08105 | retain_loss: 0.00386 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.04517 | unlearn_loss: 0.04199 | retain_loss: 0.003098 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.09717 | unlearn_loss: 0.09326 | retain_loss: 0.004089 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.04492 | unlearn_loss: 0.0415 | retain_loss: 0.003464 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1064 | unlearn_loss: 0.1011 | retain_loss: 0.005493 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.04321 | unlearn_loss: 0.03882 | retain_loss: 0.004303 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.1064 | unlearn_loss: 0.1006 | retain_loss: 0.005829 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.04443 | unlearn_loss: 0.03955 | retain_loss: 0.004913 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.125 | unlearn_loss: 0.09082 | retain_loss: 0.03394 | param_change: 0
+now we use  the mode ..  v1
+k_eff 64
+k_eff 64
+k_eff 64
+loss: 0.06348 | unlearn_loss: 0.03882 | retain_loss: 0.0249 | param_change: 0
+Saved model to models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6
+Saved model path to /egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/model_dir.txt with index 298
+[INFO] lm_eval wmdp (with retry)...
+[INFO] (wmdp) attempt=1 running: CUDA_VISIBLE_DEVICES=1 lm_eval --model hf --model_args pretrained=/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6 --device cuda:0 --batch_size 16 --log_samples --tasks wmdp --output_path "/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/lm_eval_logs/wmdp_results.json"
+/egr/research-optml/wangc168/anaconda3/envs/SOUL/lib/python3.9/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+2026-04-03:16:33:15,348 INFO     [__main__.py:279] Verbosity set to INFO
+2026-04-03:16:33:16,808 INFO     [__init__.py:491] `group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. `group`s which aggregate across subtasks must be only defined in a separate group config file, which will be the official way to create groups that support cross-task aggregation as in `mmlu`. Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs for more information.
+2026-04-03:16:33:40,965 INFO     [__main__.py:376] Selected Tasks: ['wmdp']
+2026-04-03:16:33:40,998 INFO     [evaluator.py:161] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
+2026-04-03:16:33:40,998 INFO     [evaluator.py:198] Initializing hf model, with arguments: {'pretrained': '/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6'}
+2026-04-03:16:33:41,195 INFO     [huggingface.py:130] Using device 'cuda:0'
+2026-04-03:16:33:41,721 INFO     [huggingface.py:366] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:10<00:20, 10.37s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:20<00:10, 10.51s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:31<00:00, 10.46s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:31<00:00, 10.46s/it]
+2026-04-03:16:34:15,990 WARNING  [task.py:337] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.
+2026-04-03:16:34:15,990 WARNING  [task.py:337] [Task: wmdp_cyber] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.
+2026-04-03:16:34:17,327 WARNING  [task.py:337] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.
+2026-04-03:16:34:17,327 WARNING  [task.py:337] [Task: wmdp_chem] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.
+2026-04-03:16:34:18,114 WARNING  [task.py:337] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.
+2026-04-03:16:34:18,114 WARNING  [task.py:337] [Task: wmdp_bio] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.
+2026-04-03:16:34:18,191 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:34:18,191 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:34:18,191 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:34:18,191 WARNING  [model.py:422] model.chat_template was called with the chat_template set to False or None. Therefore no chat template will be applied. Make sure this is an intended behavior.
+2026-04-03:16:34:18,192 INFO     [task.py:423] Building contexts for wmdp_bio on rank 0...
+  0%|          | 0/1273 [00:00<?, ?it/s]  8%|▊         | 104/1273 [00:00<00:01, 1039.82it/s] 18%|█▊        | 223/1273 [00:00<00:00, 1123.37it/s] 27%|██▋       | 342/1273 [00:00<00:00, 1150.04it/s] 36%|███▌      | 461/1273 [00:00<00:00, 1162.06it/s] 46%|████▌     | 580/1273 [00:00<00:00, 1170.58it/s] 55%|█████▍    | 699/1273 [00:00<00:00, 1175.36it/s] 64%|██████▍   | 818/1273 [00:00<00:00, 1176.99it/s] 74%|███████▎  | 937/1273 [00:00<00:00, 1179.19it/s] 83%|████████▎ | 1056/1273 [00:00<00:00, 1180.14it/s] 92%|█████████▏| 1175/1273 [00:01<00:00, 1182.95it/s]100%|██████████| 1273/1273 [00:01<00:00, 1170.01it/s]
+2026-04-03:16:34:19,318 INFO     [task.py:423] Building contexts for wmdp_chem on rank 0...
+  0%|          | 0/408 [00:00<?, ?it/s] 29%|██▉       | 119/408 [00:00<00:00, 1187.88it/s] 58%|█████▊    | 238/408 [00:00<00:00, 1185.27it/s] 88%|████████▊ | 357/408 [00:00<00:00, 1186.87it/s]100%|██████████| 408/408 [00:00<00:00, 1186.07it/s]
+2026-04-03:16:34:19,671 INFO     [task.py:423] Building contexts for wmdp_cyber on rank 0...
+  0%|          | 0/1987 [00:00<?, ?it/s]  6%|▌         | 118/1987 [00:00<00:01, 1172.69it/s] 12%|█▏        | 237/1987 [00:00<00:01, 1178.36it/s] 18%|█▊        | 355/1987 [00:00<00:01, 1176.40it/s] 24%|██▍       | 473/1987 [00:00<00:01, 1177.06it/s] 30%|██▉       | 591/1987 [00:00<00:01, 1178.02it/s] 36%|███▌      | 709/1987 [00:00<00:01, 1178.19it/s] 42%|████▏     | 828/1987 [00:00<00:00, 1180.62it/s] 48%|████▊     | 947/1987 [00:00<00:00, 1181.94it/s] 54%|█████▎    | 1066/1987 [00:00<00:00, 1182.56it/s] 60%|█████▉    | 1185/1987 [00:01<00:00, 1181.05it/s] 66%|██████▌   | 1304/1987 [00:01<00:00, 1180.09it/s] 72%|███████▏  | 1423/1987 [00:01<00:00, 1180.01it/s] 78%|███████▊  | 1542/1987 [00:01<00:00, 1180.03it/s] 84%|████████▎ | 1661/1987 [00:01<00:00, 1180.34it/s] 90%|████████▉ | 1780/1987 [00:01<00:00, 1180.22it/s] 96%|█████████▌| 1899/1987 [00:01<00:00, 1179.68it/s]100%|██████████| 1987/1987 [00:01<00:00, 1179.97it/s]
+2026-04-03:16:34:21,395 INFO     [evaluator.py:465] Running loglikelihood requests
+Running loglikelihood requests:   0%|          | 0/14672 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
+Running loglikelihood requests:   0%|          | 1/14672 [00:07<28:36:15,  7.02s/it]Running loglikelihood requests:   0%|          | 65/14672 [00:08<24:55,  9.77it/s]  Running loglikelihood requests:   1%|          | 129/14672 [00:10<13:49, 17.52it/s]Running loglikelihood requests:   1%|▏         | 193/14672 [00:11<10:01, 24.08it/s]Running loglikelihood requests:   2%|▏         | 257/14672 [00:13<08:04, 29.77it/s]Running loglikelihood requests:   2%|▏         | 321/14672 [00:14<06:53, 34.71it/s]Running loglikelihood requests:   3%|▎         | 385/14672 [00:15<06:03, 39.31it/s]Running loglikelihood requests:   3%|▎         | 449/14672 [00:16<05:27, 43.45it/s]Running loglikelihood requests:   3%|▎         | 513/14672 [00:17<05:00, 47.14it/s]Running loglikelihood requests:   4%|▍         | 577/14672 [00:19<04:39, 50.43it/s]Running loglikelihood requests:   4%|▍         | 641/14672 [00:20<04:20, 53.83it/s]Running loglikelihood requests:   5%|▍         | 705/14672 [00:20<04:04, 57.05it/s]Running loglikelihood requests:   5%|▌         | 769/14672 [00:21<03:52, 59.81it/s]Running loglikelihood requests:   6%|▌         | 833/14672 [00:22<03:40, 62.74it/s]Running loglikelihood requests:   6%|▌         | 897/14672 [00:23<03:30, 65.36it/s]Running loglikelihood requests:   7%|▋         | 961/14672 [00:24<03:22, 67.78it/s]Running loglikelihood requests:   7%|▋         | 1025/14672 [00:25<03:15, 69.87it/s]Running loglikelihood requests:   7%|▋         | 1089/14672 [00:26<03:08, 71.88it/s]Running loglikelihood requests:   8%|▊         | 1153/14672 [00:27<03:00, 74.76it/s]Running loglikelihood requests:   8%|▊         | 1217/14672 [00:27<02:53, 77.72it/s]Running loglikelihood requests:   9%|▊         | 1281/14672 [00:28<02:44, 81.20it/s]Running loglikelihood requests:   9%|▉         | 1345/14672 [00:29<02:37, 84.47it/s]Running loglikelihood requests:  10%|▉         | 1409/14672 [00:29<02:31, 87.47it/s]Running loglikelihood requests:  10%|█         | 1473/14672 [00:30<02:26, 90.20it/s]Running loglikelihood requests:  10%|█         | 1537/14672 [00:31<02:22, 92.17it/s]Running loglikelihood requests:  11%|█         | 1601/14672 [00:31<02:19, 93.56it/s]Running loglikelihood requests:  11%|█▏        | 1665/14672 [00:32<02:14, 96.35it/s]Running loglikelihood requests:  12%|█▏        | 1729/14672 [00:33<02:11, 98.52it/s]Running loglikelihood requests:  12%|█▏        | 1793/14672 [00:33<02:05, 102.71it/s]Running loglikelihood requests:  13%|█▎        | 1857/14672 [00:34<02:00, 106.25it/s]Running loglikelihood requests:  13%|█▎        | 1921/14672 [00:34<01:55, 110.12it/s]Running loglikelihood requests:  14%|█▎        | 1985/14672 [00:35<01:51, 113.59it/s]Running loglikelihood requests:  14%|█▍        | 2049/14672 [00:35<01:47, 117.22it/s]Running loglikelihood requests:  14%|█▍        | 2113/14672 [00:36<01:44, 120.67it/s]Running loglikelihood requests:  15%|█▍        | 2177/14672 [00:36<01:39, 125.08it/s]Running loglikelihood requests:  15%|█▌        | 2241/14672 [00:37<01:36, 129.12it/s]Running loglikelihood requests:  16%|█▌        | 2305/14672 [00:37<01:32, 133.10it/s]Running loglikelihood requests:  16%|█▌        | 2369/14672 [00:38<01:29, 137.36it/s]Running loglikelihood requests:  17%|█▋        | 2433/14672 [00:38<01:26, 141.13it/s]Running loglikelihood requests:  17%|█▋        | 2497/14672 [00:38<01:23, 145.27it/s]Running loglikelihood requests:  17%|█▋        | 2561/14672 [00:39<01:20, 149.61it/s]Running loglikelihood requests:  18%|█▊        | 2625/14672 [00:39<01:17, 154.58it/s]Running loglikelihood requests:  18%|█▊        | 2689/14672 [00:40<01:15, 159.42it/s]Running loglikelihood requests:  19%|█▉        | 2753/14672 [00:40<01:12, 164.14it/s]Running loglikelihood requests:  19%|█▉        | 2817/14672 [00:40<01:10, 169.25it/s]Running loglikelihood requests:  20%|█▉        | 2881/14672 [00:41<01:07, 174.44it/s]Running loglikelihood requests:  20%|██        | 2945/14672 [00:41<01:05, 179.07it/s]Running loglikelihood requests:  21%|██        | 3009/14672 [00:41<01:02, 185.17it/s]Running loglikelihood requests:  21%|██        | 3073/14672 [00:42<01:00, 190.69it/s]Running loglikelihood requests:  21%|██▏       | 3137/14672 [00:42<00:58, 197.64it/s]Running loglikelihood requests:  22%|██▏       | 3201/14672 [00:42<00:55, 206.18it/s]Running loglikelihood requests:  22%|██▏       | 3265/14672 [00:42<00:53, 213.77it/s]Running loglikelihood requests:  23%|██▎       | 3329/14672 [00:43<00:51, 221.86it/s]Running loglikelihood requests:  23%|██▎       | 3393/14672 [00:43<00:49, 228.72it/s]Running loglikelihood requests:  24%|██▎       | 3457/14672 [00:43<00:47, 236.26it/s]Running loglikelihood requests:  24%|██▍       | 3521/14672 [00:43<00:45, 245.82it/s]Running loglikelihood requests:  24%|██▍       | 3585/14672 [00:44<00:43, 255.11it/s]Running loglikelihood requests:  25%|██▍       | 3649/14672 [00:44<00:41, 263.32it/s]Running loglikelihood requests:  25%|██▌       | 3713/14672 [00:44<00:40, 269.28it/s]Running loglikelihood requests:  26%|██▌       | 3777/14672 [00:44<00:39, 276.43it/s]Running loglikelihood requests:  26%|██▌       | 3841/14672 [00:45<00:38, 282.67it/s]Running loglikelihood requests:  27%|██▋       | 3905/14672 [00:45<00:37, 290.03it/s]Running loglikelihood requests:  27%|██▋       | 3969/14672 [00:45<00:36, 296.27it/s]Running loglikelihood requests:  27%|██▋       | 4033/14672 [00:45<00:35, 301.75it/s]Running loglikelihood requests:  28%|██▊       | 4097/14672 [00:45<00:33, 313.41it/s]Running loglikelihood requests:  28%|██▊       | 4161/14672 [00:46<00:32, 321.67it/s]Running loglikelihood requests:  29%|██▉       | 4225/14672 [00:46<00:31, 332.12it/s]Running loglikelihood requests:  29%|██▉       | 4289/14672 [00:46<00:30, 343.22it/s]Running loglikelihood requests:  30%|██▉       | 4353/14672 [00:46<00:29, 353.59it/s]Running loglikelihood requests:  30%|███       | 4417/14672 [00:46<00:28, 365.45it/s]Running loglikelihood requests:  31%|███       | 4481/14672 [00:46<00:27, 377.39it/s]Running loglikelihood requests:  31%|███       | 4545/14672 [00:46<00:25, 394.12it/s]Running loglikelihood requests:  31%|███▏      | 4609/14672 [00:47<00:24, 407.45it/s]Running loglikelihood requests:  32%|███▏      | 4673/14672 [00:47<00:23, 418.20it/s]Running loglikelihood requests:  32%|███▏      | 4737/14672 [00:47<00:23, 426.88it/s]Running loglikelihood requests:  33%|███▎      | 4801/14672 [00:47<00:22, 432.46it/s]Running loglikelihood requests:  33%|███▎      | 4865/14672 [00:47<00:22, 441.18it/s]Running loglikelihood requests:  34%|███▎      | 4929/14672 [00:47<00:21, 449.29it/s]Running loglikelihood requests:  34%|███▍      | 4993/14672 [00:47<00:21, 457.18it/s]Running loglikelihood requests:  34%|███▍      | 5057/14672 [00:48<00:20, 462.30it/s]Running loglikelihood requests:  35%|███▍      | 5121/14672 [00:48<00:20, 466.40it/s]Running loglikelihood requests:  35%|███▌      | 5185/14672 [00:48<00:20, 469.62it/s]Running loglikelihood requests:  36%|███▌      | 5249/14672 [00:48<00:19, 479.28it/s]Running loglikelihood requests:  36%|███▌      | 5313/14672 [00:48<00:19, 486.88it/s]Running loglikelihood requests:  37%|███▋      | 5377/14672 [00:48<00:18, 491.95it/s]Running loglikelihood requests:  37%|███▋      | 5441/14672 [00:48<00:18, 497.51it/s]Running loglikelihood requests:  38%|███▊      | 5505/14672 [00:49<00:18, 500.18it/s]Running loglikelihood requests:  38%|███▊      | 5569/14672 [00:49<00:18, 503.95it/s]Running loglikelihood requests:  38%|███▊      | 5633/14672 [00:49<00:17, 508.41it/s]Running loglikelihood requests:  39%|███▉      | 5697/14672 [00:49<00:17, 512.21it/s]Running loglikelihood requests:  39%|███▉      | 5761/14672 [00:49<00:17, 515.34it/s]Running loglikelihood requests:  40%|███▉      | 5825/14672 [00:49<00:17, 517.41it/s]Running loglikelihood requests:  40%|████      | 5889/14672 [00:49<00:16, 528.06it/s]Running loglikelihood requests:  41%|████      | 5953/14672 [00:49<00:16, 537.50it/s]Running loglikelihood requests:  41%|████      | 6017/14672 [00:49<00:15, 544.64it/s]Running loglikelihood requests:  41%|████▏     | 6081/14672 [00:50<00:15, 549.42it/s]Running loglikelihood requests:  42%|████▏     | 6145/14672 [00:50<00:15, 553.12it/s]Running loglikelihood requests:  42%|████▏     | 6209/14672 [00:50<00:15, 557.93it/s]Running loglikelihood requests:  43%|████▎     | 6273/14672 [00:50<00:14, 560.26it/s]Running loglikelihood requests:  43%|████▎     | 6337/14672 [00:50<00:14, 563.14it/s]Running loglikelihood requests:  44%|████▎     | 6401/14672 [00:50<00:14, 565.75it/s]Running loglikelihood requests:  44%|████▍     | 6465/14672 [00:50<00:14, 567.08it/s]Running loglikelihood requests:  44%|████▍     | 6529/14672 [00:50<00:14, 569.49it/s]Running loglikelihood requests:  45%|████▍     | 6593/14672 [00:50<00:14, 569.81it/s]Running loglikelihood requests:  45%|████▌     | 6657/14672 [00:51<00:14, 568.82it/s]Running loglikelihood requests:  46%|████▌     | 6721/14672 [00:51<00:13, 569.32it/s]Running loglikelihood requests:  46%|████▌     | 6785/14672 [00:51<00:13, 570.94it/s]Running loglikelihood requests:  47%|████▋     | 6872/14672 [00:51<00:11, 653.60it/s]Running loglikelihood requests:  47%|████▋     | 6967/14672 [00:51<00:10, 737.51it/s]Running loglikelihood requests:  48%|████▊     | 7042/14672 [00:51<00:13, 586.29it/s]Running loglikelihood requests:  49%|████▊     | 7140/14672 [00:51<00:11, 682.88it/s]Running loglikelihood requests:  49%|████▉     | 7233/14672 [00:52<00:12, 602.75it/s]Running loglikelihood requests:  50%|████▉     | 7310/14672 [00:52<00:11, 641.27it/s]Running loglikelihood requests:  50%|█████     | 7394/14672 [00:52<00:10, 690.46it/s]Running loglikelihood requests:  51%|█████     | 7486/14672 [00:52<00:09, 750.40it/s]Running loglikelihood requests:  52%|█████▏    | 7566/14672 [00:52<00:11, 613.66it/s]Running loglikelihood requests:  52%|█████▏    | 7681/14672 [00:52<00:11, 609.26it/s]Running loglikelihood requests:  53%|█████▎    | 7792/14672 [00:52<00:09, 718.48it/s]Running loglikelihood requests:  54%|█████▎    | 7873/14672 [00:52<00:11, 613.85it/s]Running loglikelihood requests:  55%|█████▍    | 8001/14672 [00:53<00:10, 634.32it/s]Running loglikelihood requests:  55%|█████▌    | 8129/14672 [00:53<00:10, 647.81it/s]Running loglikelihood requests:  56%|█████▋    | 8255/14672 [00:53<00:08, 770.20it/s]Running loglikelihood requests:  57%|█████▋    | 8341/14672 [00:53<00:09, 671.09it/s]Running loglikelihood requests:  58%|█████▊    | 8449/14672 [00:53<00:09, 647.92it/s]Running loglikelihood requests:  58%|█████▊    | 8577/14672 [00:53<00:09, 668.34it/s]Running loglikelihood requests:  59%|█████▉    | 8705/14672 [00:54<00:08, 682.42it/s]Running loglikelihood requests:  60%|██████    | 8833/14672 [00:54<00:08, 694.59it/s]Running loglikelihood requests:  61%|██████    | 8961/14672 [00:54<00:08, 703.87it/s]Running loglikelihood requests:  62%|██████▏   | 9089/14672 [00:54<00:07, 709.22it/s]Running loglikelihood requests:  63%|██████▎   | 9217/14672 [00:54<00:07, 712.80it/s]Running loglikelihood requests:  64%|██████▎   | 9345/14672 [00:55<00:07, 716.78it/s]Running loglikelihood requests:  65%|██████▍   | 9473/14672 [00:55<00:07, 720.24it/s]Running loglikelihood requests:  65%|██████▌   | 9601/14672 [00:55<00:07, 723.19it/s]Running loglikelihood requests:  66%|██████▋   | 9729/14672 [00:55<00:06, 733.11it/s]Running loglikelihood requests:  67%|██████▋   | 9857/14672 [00:55<00:06, 749.42it/s]Running loglikelihood requests:  68%|██████▊   | 9985/14672 [00:55<00:06, 761.63it/s]Running loglikelihood requests:  69%|██████▉   | 10113/14672 [00:56<00:05, 771.50it/s]Running loglikelihood requests:  70%|██████▉   | 10241/14672 [00:56<00:05, 779.97it/s]Running loglikelihood requests:  71%|███████   | 10369/14672 [00:56<00:05, 784.02it/s]Running loglikelihood requests:  72%|███████▏  | 10497/14672 [00:56<00:05, 789.79it/s]Running loglikelihood requests:  72%|███████▏  | 10625/14672 [00:56<00:05, 794.82it/s]Running loglikelihood requests:  73%|███████▎  | 10753/14672 [00:56<00:04, 797.44it/s]Running loglikelihood requests:  74%|███████▍  | 10881/14672 [00:57<00:04, 801.15it/s]Running loglikelihood requests:  75%|███████▌  | 11009/14672 [00:57<00:04, 803.77it/s]Running loglikelihood requests:  76%|███████▌  | 11137/14672 [00:57<00:04, 805.88it/s]Running loglikelihood requests:  77%|███████▋  | 11265/14672 [00:57<00:04, 808.79it/s]Running loglikelihood requests:  78%|███████▊  | 11393/14672 [00:57<00:04, 810.42it/s]Running loglikelihood requests:  79%|███████▊  | 11521/14672 [00:57<00:03, 812.74it/s]Running loglikelihood requests:  79%|███████▉  | 11649/14672 [00:57<00:03, 816.03it/s]Running loglikelihood requests:  80%|████████  | 11777/14672 [00:58<00:03, 839.68it/s]Running loglikelihood requests:  81%|████████  | 11905/14672 [00:58<00:03, 858.12it/s]Running loglikelihood requests:  82%|████████▏ | 12033/14672 [00:58<00:03, 873.49it/s]Running loglikelihood requests:  83%|████████▎ | 12161/14672 [00:58<00:02, 886.31it/s]Running loglikelihood requests:  84%|████████▍ | 12289/14672 [00:58<00:02, 896.07it/s]Running loglikelihood requests:  85%|████████▍ | 12417/14672 [00:58<00:02, 904.78it/s]Running loglikelihood requests:  86%|████████▌ | 12545/14672 [00:58<00:02, 911.92it/s]Running loglikelihood requests:  86%|████████▋ | 12673/14672 [00:59<00:02, 919.28it/s]Running loglikelihood requests:  87%|████████▋ | 12801/14672 [00:59<00:02, 925.84it/s]Running loglikelihood requests:  88%|████████▊ | 12929/14672 [00:59<00:01, 932.72it/s]Running loglikelihood requests:  89%|████████▉ | 13057/14672 [00:59<00:01, 936.23it/s]Running loglikelihood requests:  90%|████████▉ | 13185/14672 [00:59<00:01, 942.82it/s]Running loglikelihood requests:  91%|█████████ | 13313/14672 [00:59<00:01, 948.53it/s]Running loglikelihood requests:  92%|█████████▏| 13441/14672 [00:59<00:01, 951.85it/s]Running loglikelihood requests:  92%|█████████▏| 13569/14672 [01:00<00:01, 956.08it/s]Running loglikelihood requests:  93%|█████████▎| 13697/14672 [01:00<00:01, 961.39it/s]Running loglikelihood requests:  94%|█████████▍| 13825/14672 [01:00<00:00, 983.74it/s]Running loglikelihood requests:  95%|█████████▌| 13953/14672 [01:00<00:00, 1022.58it/s]Running loglikelihood requests:  96%|█████████▌| 14081/14672 [01:00<00:00, 1057.84it/s]Running loglikelihood requests:  97%|█████████▋| 14209/14672 [01:00<00:00, 1082.58it/s]Running loglikelihood requests:  98%|█████████▊| 14337/14672 [01:00<00:00, 1106.97it/s]Running loglikelihood requests:  99%|█████████▊| 14465/14672 [01:00<00:00, 1127.11it/s]Running loglikelihood requests:  99%|█████████▉| 14593/14672 [01:00<00:00, 1144.64it/s]Running loglikelihood requests: 100%|██████████| 14672/14672 [01:00<00:00, 240.60it/s] 
+2026-04-03:16:35:29,404 WARNING  [huggingface.py:1344] Failed to get model SHA for /egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6 at revision main. Error: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6'. Use `repo_type` argument if needed.
+2026-04-03:16:35:44,231 INFO     [evaluation_tracker.py:206] Saving results aggregated
+2026-04-03:16:35:44,240 INFO     [evaluation_tracker.py:287] Saving per-sample results for: wmdp_bio
+2026-04-03:16:35:45,155 INFO     [evaluation_tracker.py:287] Saving per-sample results for: wmdp_chem
+2026-04-03:16:35:45,449 INFO     [evaluation_tracker.py:287] Saving per-sample results for: wmdp_cyber
+hf (pretrained=/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 16
+|    Tasks    |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
+|-------------|------:|------|-----:|------|---|-----:|---|-----:|
+|wmdp         |      1|none  |      |acc   |↑  |0.3132|±  |0.0076|
+| - wmdp_bio  |      1|none  |     0|acc   |↑  |0.3009|±  |0.0129|
+| - wmdp_chem |      1|none  |     0|acc   |↑  |0.4583|±  |0.0247|
+| - wmdp_cyber|      1|none  |     0|acc   |↑  |0.2914|±  |0.0102|
+
+|Groups|Version|Filter|n-shot|Metric|   |Value |   |Stderr|
+|------|------:|------|------|------|---|-----:|---|-----:|
+|wmdp  |      1|none  |      |acc   |↑  |0.3132|±  |0.0076|
+
+[INFO] (wmdp) success (found result table and non-empty json).
+[INFO] lm_eval mmlu (with retry)...
+[INFO] (mmlu) attempt=1 running: CUDA_VISIBLE_DEVICES=1 lm_eval --model hf --model_args pretrained=/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6 --device cuda:0 --batch_size 16 --log_samples --tasks mmlu --output_path "/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/lm_eval_logs/mmlu_results.json"
+/egr/research-optml/wangc168/anaconda3/envs/SOUL/lib/python3.9/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
+  warnings.warn(
+2026-04-03:16:35:59,252 INFO     [__main__.py:279] Verbosity set to INFO
+2026-04-03:16:35:59,682 INFO     [__init__.py:491] `group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. `group`s which aggregate across subtasks must be only defined in a separate group config file, which will be the official way to create groups that support cross-task aggregation as in `mmlu`. Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs for more information.
+2026-04-03:16:36:07,612 INFO     [__main__.py:376] Selected Tasks: ['mmlu']
+2026-04-03:16:36:07,615 INFO     [evaluator.py:161] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
+2026-04-03:16:36:07,615 INFO     [evaluator.py:198] Initializing hf model, with arguments: {'pretrained': '/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6'}
+2026-04-03:16:36:07,760 INFO     [huggingface.py:130] Using device 'cuda:0'
+2026-04-03:16:36:07,992 INFO     [huggingface.py:366] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
+Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]Loading checkpoint shards:  33%|███▎      | 1/3 [00:01<00:02,  1.13s/it]Loading checkpoint shards:  67%|██████▋   | 2/3 [00:02<00:01,  1.11s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.03s/it]Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]
+2026-04-03:16:37:53,518 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,518 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,518 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,519 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,520 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 INFO     [evaluator.py:279] Setting fewshot random generator seed to 1234
+2026-04-03:16:37:53,521 WARNING  [model.py:422] model.chat_template was called with the chat_template set to False or None. Therefore no chat template will be applied. Make sure this is an intended behavior.
+2026-04-03:16:37:53,529 INFO     [task.py:423] Building contexts for mmlu_high_school_mathematics on rank 0...
+  0%|          | 0/270 [00:00<?, ?it/s] 30%|███       | 82/270 [00:00<00:00, 819.04it/s] 74%|███████▍  | 200/270 [00:00<00:00, 1029.54it/s]100%|██████████| 270/270 [00:00<00:00, 1038.65it/s]
+2026-04-03:16:37:53,804 INFO     [task.py:423] Building contexts for mmlu_college_chemistry on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1178.71it/s]
+2026-04-03:16:37:53,891 INFO     [task.py:423] Building contexts for mmlu_computer_security on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1190.39it/s]
+2026-04-03:16:37:53,978 INFO     [task.py:423] Building contexts for mmlu_conceptual_physics on rank 0...
+  0%|          | 0/235 [00:00<?, ?it/s] 51%|█████     | 120/235 [00:00<00:00, 1197.33it/s]100%|██████████| 235/235 [00:00<00:00, 1192.51it/s]
+2026-04-03:16:37:54,181 INFO     [task.py:423] Building contexts for mmlu_college_biology on rank 0...
+  0%|          | 0/144 [00:00<?, ?it/s] 83%|████████▎ | 120/144 [00:00<00:00, 1193.77it/s]100%|██████████| 144/144 [00:00<00:00, 1193.02it/s]
+2026-04-03:16:37:54,306 INFO     [task.py:423] Building contexts for mmlu_college_computer_science on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1187.27it/s]
+2026-04-03:16:37:54,392 INFO     [task.py:423] Building contexts for mmlu_abstract_algebra on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1204.80it/s]
+2026-04-03:16:37:54,478 INFO     [task.py:423] Building contexts for mmlu_machine_learning on rank 0...
+  0%|          | 0/112 [00:00<?, ?it/s]100%|██████████| 112/112 [00:00<00:00, 1202.25it/s]
+2026-04-03:16:37:54,575 INFO     [task.py:423] Building contexts for mmlu_high_school_computer_science on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1191.12it/s]
+2026-04-03:16:37:54,661 INFO     [task.py:423] Building contexts for mmlu_college_physics on rank 0...
+  0%|          | 0/102 [00:00<?, ?it/s]100%|██████████| 102/102 [00:00<00:00, 1194.07it/s]
+2026-04-03:16:37:54,750 INFO     [task.py:423] Building contexts for mmlu_high_school_statistics on rank 0...
+  0%|          | 0/216 [00:00<?, ?it/s] 55%|█████▌    | 119/216 [00:00<00:00, 1186.72it/s]100%|██████████| 216/216 [00:00<00:00, 1187.95it/s]
+2026-04-03:16:37:54,937 INFO     [task.py:423] Building contexts for mmlu_high_school_physics on rank 0...
+  0%|          | 0/151 [00:00<?, ?it/s] 79%|███████▉  | 119/151 [00:00<00:00, 1189.56it/s]100%|██████████| 151/151 [00:00<00:00, 1188.83it/s]
+2026-04-03:16:37:55,068 INFO     [task.py:423] Building contexts for mmlu_elementary_mathematics on rank 0...
+  0%|          | 0/378 [00:00<?, ?it/s] 31%|███       | 118/378 [00:00<00:00, 1179.87it/s] 63%|██████▎   | 237/378 [00:00<00:00, 1185.01it/s] 94%|█████████▍| 356/378 [00:00<00:00, 1184.24it/s]100%|██████████| 378/378 [00:00<00:00, 1183.84it/s]
+2026-04-03:16:37:55,396 INFO     [task.py:423] Building contexts for mmlu_electrical_engineering on rank 0...
+  0%|          | 0/145 [00:00<?, ?it/s] 83%|████████▎ | 120/145 [00:00<00:00, 1198.54it/s]100%|██████████| 145/145 [00:00<00:00, 1195.24it/s]
+2026-04-03:16:37:55,521 INFO     [task.py:423] Building contexts for mmlu_high_school_biology on rank 0...
+  0%|          | 0/310 [00:00<?, ?it/s] 39%|███▊      | 120/310 [00:00<00:00, 1194.88it/s] 77%|███████▋  | 240/310 [00:00<00:00, 1189.11it/s]100%|██████████| 310/310 [00:00<00:00, 1189.86it/s]
+2026-04-03:16:37:55,790 INFO     [task.py:423] Building contexts for mmlu_astronomy on rank 0...
+  0%|          | 0/152 [00:00<?, ?it/s] 79%|███████▉  | 120/152 [00:00<00:00, 1194.28it/s]100%|██████████| 152/152 [00:00<00:00, 1191.56it/s]
+2026-04-03:16:37:55,921 INFO     [task.py:423] Building contexts for mmlu_anatomy on rank 0...
+  0%|          | 0/135 [00:00<?, ?it/s] 89%|████████▉ | 120/135 [00:00<00:00, 1199.04it/s]100%|██████████| 135/135 [00:00<00:00, 1196.98it/s]
+2026-04-03:16:37:56,038 INFO     [task.py:423] Building contexts for mmlu_college_mathematics on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1187.91it/s]
+2026-04-03:16:37:56,124 INFO     [task.py:423] Building contexts for mmlu_high_school_chemistry on rank 0...
+  0%|          | 0/203 [00:00<?, ?it/s] 59%|█████▉    | 120/203 [00:00<00:00, 1194.98it/s]100%|██████████| 203/203 [00:00<00:00, 1195.32it/s]
+2026-04-03:16:37:56,299 INFO     [task.py:423] Building contexts for mmlu_management on rank 0...
+  0%|          | 0/103 [00:00<?, ?it/s]100%|██████████| 103/103 [00:00<00:00, 1196.33it/s]
+2026-04-03:16:37:56,388 INFO     [task.py:423] Building contexts for mmlu_clinical_knowledge on rank 0...
+  0%|          | 0/265 [00:00<?, ?it/s] 45%|████▌     | 120/265 [00:00<00:00, 1190.62it/s] 91%|█████████ | 240/265 [00:00<00:00, 1188.99it/s]100%|██████████| 265/265 [00:00<00:00, 1188.80it/s]
+2026-04-03:16:37:56,618 INFO     [task.py:423] Building contexts for mmlu_professional_medicine on rank 0...
+  0%|          | 0/272 [00:00<?, ?it/s] 44%|████▍     | 120/272 [00:00<00:00, 1192.97it/s] 88%|████████▊ | 240/272 [00:00<00:00, 1191.79it/s]100%|██████████| 272/272 [00:00<00:00, 1191.97it/s]
+2026-04-03:16:37:56,853 INFO     [task.py:423] Building contexts for mmlu_human_aging on rank 0...
+  0%|          | 0/223 [00:00<?, ?it/s] 33%|███▎      | 74/223 [00:00<00:00, 340.71it/s] 87%|████████▋ | 193/223 [00:00<00:00, 676.52it/s]100%|██████████| 223/223 [00:00<00:00, 650.37it/s]
+2026-04-03:16:37:57,201 INFO     [task.py:423] Building contexts for mmlu_professional_accounting on rank 0...
+  0%|          | 0/282 [00:00<?, ?it/s] 42%|████▏     | 119/282 [00:00<00:00, 1189.22it/s] 84%|████████▍ | 238/282 [00:00<00:00, 1186.54it/s]100%|██████████| 282/282 [00:00<00:00, 1185.98it/s]
+2026-04-03:16:37:57,447 INFO     [task.py:423] Building contexts for mmlu_college_medicine on rank 0...
+  0%|          | 0/173 [00:00<?, ?it/s] 69%|██████▉   | 119/173 [00:00<00:00, 1189.56it/s]100%|██████████| 173/173 [00:00<00:00, 1191.94it/s]
+2026-04-03:16:37:57,596 INFO     [task.py:423] Building contexts for mmlu_medical_genetics on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1192.51it/s]
+2026-04-03:16:37:57,683 INFO     [task.py:423] Building contexts for mmlu_miscellaneous on rank 0...
+  0%|          | 0/783 [00:00<?, ?it/s] 15%|█▌        | 120/783 [00:00<00:00, 1197.11it/s] 31%|███       | 240/783 [00:00<00:00, 1193.18it/s] 46%|████▌     | 360/783 [00:00<00:00, 1190.30it/s] 61%|██████▏   | 480/783 [00:00<00:00, 1191.35it/s] 77%|███████▋  | 600/783 [00:00<00:00, 1189.42it/s] 92%|█████████▏| 720/783 [00:00<00:00, 1190.78it/s]100%|██████████| 783/783 [00:00<00:00, 1190.75it/s]
+2026-04-03:16:37:58,358 INFO     [task.py:423] Building contexts for mmlu_global_facts on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1197.45it/s]
+2026-04-03:16:37:58,445 INFO     [task.py:423] Building contexts for mmlu_virology on rank 0...
+  0%|          | 0/166 [00:00<?, ?it/s] 72%|███████▏  | 120/166 [00:00<00:00, 1198.06it/s]100%|██████████| 166/166 [00:00<00:00, 1197.94it/s]
+2026-04-03:16:37:58,587 INFO     [task.py:423] Building contexts for mmlu_business_ethics on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1196.69it/s]
+2026-04-03:16:37:58,674 INFO     [task.py:423] Building contexts for mmlu_nutrition on rank 0...
+  0%|          | 0/306 [00:00<?, ?it/s] 39%|███▉      | 120/306 [00:00<00:00, 1198.40it/s] 78%|███████▊  | 240/306 [00:00<00:00, 1194.25it/s]100%|██████████| 306/306 [00:00<00:00, 1193.85it/s]
+2026-04-03:16:37:58,937 INFO     [task.py:423] Building contexts for mmlu_marketing on rank 0...
+  0%|          | 0/234 [00:00<?, ?it/s] 51%|█████▏    | 120/234 [00:00<00:00, 1194.33it/s]100%|██████████| 234/234 [00:00<00:00, 1194.43it/s]
+2026-04-03:16:37:59,139 INFO     [task.py:423] Building contexts for mmlu_professional_psychology on rank 0...
+  0%|          | 0/612 [00:00<?, ?it/s] 20%|█▉        | 120/612 [00:00<00:00, 1190.56it/s] 39%|███▉      | 240/612 [00:00<00:00, 1191.50it/s] 59%|█████▉    | 360/612 [00:00<00:00, 1193.22it/s] 78%|███████▊  | 480/612 [00:00<00:00, 1192.31it/s] 98%|█████████▊| 600/612 [00:00<00:00, 1193.37it/s]100%|██████████| 612/612 [00:00<00:00, 1192.52it/s]
+2026-04-03:16:37:59,667 INFO     [task.py:423] Building contexts for mmlu_high_school_geography on rank 0...
+  0%|          | 0/198 [00:00<?, ?it/s] 61%|██████    | 120/198 [00:00<00:00, 1190.72it/s]100%|██████████| 198/198 [00:00<00:00, 1190.10it/s]
+2026-04-03:16:37:59,838 INFO     [task.py:423] Building contexts for mmlu_high_school_psychology on rank 0...
+  0%|          | 0/545 [00:00<?, ?it/s] 22%|██▏       | 121/545 [00:00<00:00, 1200.06it/s] 44%|████▍     | 242/545 [00:00<00:00, 1191.97it/s] 66%|██████▋   | 362/545 [00:00<00:00, 1192.51it/s] 88%|████████▊ | 482/545 [00:00<00:00, 1191.55it/s]100%|██████████| 545/545 [00:00<00:00, 1191.37it/s]
+2026-04-03:16:38:00,309 INFO     [task.py:423] Building contexts for mmlu_us_foreign_policy on rank 0...
+  0%|          | 0/100 [00:00<?, ?it/s]100%|██████████| 100/100 [00:00<00:00, 1186.03it/s]
+2026-04-03:16:38:00,396 INFO     [task.py:423] Building contexts for mmlu_high_school_microeconomics on rank 0...
+  0%|          | 0/238 [00:00<?, ?it/s] 50%|█████     | 120/238 [00:00<00:00, 1191.20it/s]100%|██████████| 238/238 [00:00<00:00, 1190.68it/s]
+2026-04-03:16:38:00,602 INFO     [task.py:423] Building contexts for mmlu_sociology on rank 0...
+  0%|          | 0/201 [00:00<?, ?it/s] 60%|█████▉    | 120/201 [00:00<00:00, 1191.49it/s]100%|██████████| 201/201 [00:00<00:00, 1189.86it/s]
+2026-04-03:16:38:00,776 INFO     [task.py:423] Building contexts for mmlu_security_studies on rank 0...
+  0%|          | 0/245 [00:00<?, ?it/s] 49%|████▊     | 119/245 [00:00<00:00, 1188.33it/s] 97%|█████████▋| 238/245 [00:00<00:00, 1187.83it/s]100%|██████████| 245/245 [00:00<00:00, 1186.74it/s]
+2026-04-03:16:38:00,989 INFO     [task.py:423] Building contexts for mmlu_public_relations on rank 0...
+  0%|          | 0/110 [00:00<?, ?it/s]100%|██████████| 110/110 [00:00<00:00, 1199.83it/s]
+2026-04-03:16:38:01,083 INFO     [task.py:423] Building contexts for mmlu_human_sexuality on rank 0...
+  0%|          | 0/131 [00:00<?, ?it/s] 92%|█████████▏| 120/131 [00:00<00:00, 1198.52it/s]100%|██████████| 131/131 [00:00<00:00, 1196.95it/s]
+2026-04-03:16:38:01,196 INFO     [task.py:423] Building contexts for mmlu_high_school_government_and_politics on rank 0...
+  0%|          | 0/193 [00:00<?, ?it/s] 62%|██████▏   | 119/193 [00:00<00:00, 1188.54it/s]100%|██████████| 193/193 [00:00<00:00, 1188.85it/s]
+2026-04-03:16:38:01,364 INFO     [task.py:423] Building contexts for mmlu_econometrics on rank 0...
+  0%|          | 0/114 [00:00<?, ?it/s]100%|██████████| 114/114 [00:00<00:00, 1193.03it/s]
+2026-04-03:16:38:01,462 INFO     [task.py:423] Building contexts for mmlu_high_school_macroeconomics on rank 0...
+  0%|          | 0/390 [00:00<?, ?it/s] 31%|███       | 121/390 [00:00<00:00, 1200.51it/s] 62%|██████▏   | 242/390 [00:00<00:00, 1193.08it/s] 93%|█████████▎| 362/390 [00:00<00:00, 1192.32it/s]100%|██████████| 390/390 [00:00<00:00, 1191.50it/s]
+2026-04-03:16:38:01,799 INFO     [task.py:423] Building contexts for mmlu_formal_logic on rank 0...
+  0%|          | 0/126 [00:00<?, ?it/s] 94%|█████████▍| 119/126 [00:00<00:00, 1187.81it/s]100%|██████████| 126/126 [00:00<00:00, 1186.32it/s]
+2026-04-03:16:38:01,908 INFO     [task.py:423] Building contexts for mmlu_moral_disputes on rank 0...
+  0%|          | 0/346 [00:00<?, ?it/s] 34%|███▍      | 119/346 [00:00<00:00, 1186.30it/s] 69%|██████▉   | 239/346 [00:00<00:00, 1189.05it/s]100%|██████████| 346/346 [00:00<00:00, 1187.89it/s]
+2026-04-03:16:38:02,208 INFO     [task.py:423] Building contexts for mmlu_moral_scenarios on rank 0...
+  0%|          | 0/895 [00:00<?, ?it/s] 13%|█▎        | 120/895 [00:00<00:00, 1190.24it/s] 27%|██▋       | 240/895 [00:00<00:00, 1191.14it/s] 40%|████      | 360/895 [00:00<00:00, 1189.38it/s] 54%|█████▎    | 480/895 [00:00<00:00, 1190.82it/s] 67%|██████▋   | 600/895 [00:00<00:00, 1187.94it/s] 80%|████████  | 719/895 [00:00<00:00, 1187.12it/s] 94%|█████████▎| 839/895 [00:00<00:00, 1189.43it/s]100%|██████████| 895/895 [00:00<00:00, 1188.78it/s]
+2026-04-03:16:38:02,982 INFO     [task.py:423] Building contexts for mmlu_jurisprudence on rank 0...
+  0%|          | 0/108 [00:00<?, ?it/s]100%|██████████| 108/108 [00:00<00:00, 1196.02it/s]
+2026-04-03:16:38:03,075 INFO     [task.py:423] Building contexts for mmlu_high_school_european_history on rank 0...
+  0%|          | 0/165 [00:00<?, ?it/s] 59%|█████▉    | 97/165 [00:00<00:00, 316.18it/s]100%|██████████| 165/165 [00:00<00:00, 451.69it/s]
+2026-04-03:16:38:03,445 INFO     [task.py:423] Building contexts for mmlu_professional_law on rank 0...
+  0%|          | 0/1534 [00:00<?, ?it/s]  8%|▊         | 119/1534 [00:00<00:01, 1181.94it/s] 16%|█▌        | 238/1534 [00:00<00:01, 1183.71it/s] 23%|██▎       | 357/1534 [00:00<00:00, 1185.37it/s] 31%|███       | 477/1534 [00:00<00:00, 1188.98it/s] 39%|███▉      | 597/1534 [00:00<00:00, 1189.89it/s] 47%|████▋     | 717/1534 [00:00<00:00, 1192.61it/s] 55%|█████▍    | 837/1534 [00:00<00:00, 1191.78it/s] 62%|██████▏   | 957/1534 [00:00<00:00, 1192.38it/s] 70%|███████   | 1077/1534 [00:00<00:00, 1192.50it/s] 78%|███████▊  | 1197/1534 [00:01<00:00, 1190.77it/s] 86%|████████▌ | 1317/1534 [00:01<00:00, 1190.94it/s] 94%|█████████▎| 1437/1534 [00:01<00:00, 1190.67it/s]100%|██████████| 1534/1534 [00:01<00:00, 1190.33it/s]
+2026-04-03:16:38:04,772 INFO     [task.py:423] Building contexts for mmlu_prehistory on rank 0...
+  0%|          | 0/324 [00:00<?, ?it/s] 37%|███▋      | 119/324 [00:00<00:00, 1181.62it/s] 73%|███████▎  | 238/324 [00:00<00:00, 1184.06it/s]100%|██████████| 324/324 [00:00<00:00, 1184.38it/s]
+2026-04-03:16:38:05,054 INFO     [task.py:423] Building contexts for mmlu_world_religions on rank 0...
+  0%|          | 0/171 [00:00<?, ?it/s] 70%|███████   | 120/171 [00:00<00:00, 1191.27it/s]100%|██████████| 171/171 [00:00<00:00, 1191.67it/s]
+2026-04-03:16:38:05,202 INFO     [task.py:423] Building contexts for mmlu_high_school_world_history on rank 0...
+  0%|          | 0/237 [00:00<?, ?it/s] 50%|█████     | 119/237 [00:00<00:00, 1188.77it/s]100%|██████████| 237/237 [00:00<00:00, 1186.20it/s]
+2026-04-03:16:38:05,408 INFO     [task.py:423] Building contexts for mmlu_international_law on rank 0...
+  0%|          | 0/121 [00:00<?, ?it/s] 99%|█████████▉| 120/121 [00:00<00:00, 1193.52it/s]100%|██████████| 121/121 [00:00<00:00, 1191.33it/s]
+2026-04-03:16:38:05,513 INFO     [task.py:423] Building contexts for mmlu_logical_fallacies on rank 0...
+  0%|          | 0/163 [00:00<?, ?it/s] 74%|███████▍  | 121/163 [00:00<00:00, 1205.56it/s]100%|██████████| 163/163 [00:00<00:00, 1202.62it/s]
+2026-04-03:16:38:05,652 INFO     [task.py:423] Building contexts for mmlu_philosophy on rank 0...
+  0%|          | 0/311 [00:00<?, ?it/s] 39%|███▉      | 121/311 [00:00<00:00, 1201.64it/s] 78%|███████▊  | 242/311 [00:00<00:00, 1195.51it/s]100%|██████████| 311/311 [00:00<00:00, 1196.14it/s]
+2026-04-03:16:38:05,920 INFO     [task.py:423] Building contexts for mmlu_high_school_us_history on rank 0...
+  0%|          | 0/204 [00:00<?, ?it/s] 58%|█████▊    | 118/204 [00:00<00:00, 1174.86it/s]100%|██████████| 204/204 [00:00<00:00, 1177.55it/s]
+2026-04-03:16:38:06,099 INFO     [evaluator.py:465] Running loglikelihood requests
+Running loglikelihood requests:   0%|          | 0/56168 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
+Running loglikelihood requests:   0%|          | 1/56168 [00:01<16:39:24,  1.07s/it]Running loglikelihood requests:   0%|          | 65/56168 [00:01<18:32, 50.45it/s]  Running loglikelihood requests:   0%|          | 129/56168 [00:02<11:48, 79.12it/s]Running loglikelihood requests:   0%|          | 193/56168 [00:02<09:29, 98.35it/s]Running loglikelihood requests:   0%|          | 257/56168 [00:02<08:15, 112.82it/s]Running loglikelihood requests:   1%|          | 321/56168 [00:03<07:32, 123.51it/s]Running loglikelihood requests:   1%|          | 385/56168 [00:03<07:01, 132.40it/s]Running loglikelihood requests:   1%|          | 449/56168 [00:04<06:38, 139.96it/s]Running loglikelihood requests:   1%|          | 513/56168 [00:04<06:21, 145.87it/s]Running loglikelihood requests:   1%|          | 577/56168 [00:05<06:05, 152.14it/s]Running loglikelihood requests:   1%|          | 641/56168 [00:05<05:53, 156.97it/s]Running loglikelihood requests:   1%|▏         | 705/56168 [00:05<05:43, 161.51it/s]Running loglikelihood requests:   1%|▏         | 769/56168 [00:06<05:35, 165.10it/s]Running loglikelihood requests:   1%|▏         | 833/56168 [00:06<05:28, 168.64it/s]Running loglikelihood requests:   2%|▏         | 897/56168 [00:06<05:21, 171.88it/s]Running loglikelihood requests:   2%|▏         | 961/56168 [00:07<05:14, 175.63it/s]Running loglikelihood requests:   2%|▏         | 1025/56168 [00:07<05:08, 178.72it/s]Running loglikelihood requests:   2%|▏         | 1089/56168 [00:07<05:02, 182.32it/s]Running loglikelihood requests:   2%|▏         | 1153/56168 [00:08<04:58, 184.28it/s]Running loglikelihood requests:   2%|▏         | 1217/56168 [00:08<04:52, 187.93it/s]Running loglikelihood requests:   2%|▏         | 1281/56168 [00:08<04:49, 189.47it/s]Running loglikelihood requests:   2%|▏         | 1345/56168 [00:09<04:44, 192.65it/s]Running loglikelihood requests:   3%|▎         | 1409/56168 [00:09<04:39, 195.93it/s]Running loglikelihood requests:   3%|▎         | 1473/56168 [00:09<04:36, 197.88it/s]Running loglikelihood requests:   3%|▎         | 1537/56168 [00:10<04:33, 199.88it/s]Running loglikelihood requests:   3%|▎         | 1601/56168 [00:10<04:30, 201.83it/s]Running loglikelihood requests:   3%|▎         | 1665/56168 [00:10<04:27, 203.61it/s]Running loglikelihood requests:   3%|▎         | 1729/56168 [00:11<04:25, 205.33it/s]Running loglikelihood requests:   3%|▎         | 1793/56168 [00:11<04:22, 207.19it/s]Running loglikelihood requests:   3%|▎         | 1857/56168 [00:11<04:20, 208.44it/s]Running loglikelihood requests:   3%|▎         | 1921/56168 [00:11<04:17, 210.57it/s]Running loglikelihood requests:   4%|▎         | 1985/56168 [00:12<04:15, 212.47it/s]Running loglikelihood requests:   4%|▎         | 2049/56168 [00:12<04:12, 214.06it/s]Running loglikelihood requests:   4%|▍         | 2113/56168 [00:12<04:11, 214.75it/s]Running loglikelihood requests:   4%|▍         | 2177/56168 [00:13<04:06, 218.86it/s]Running loglikelihood requests:   4%|▍         | 2241/56168 [00:13<04:03, 221.92it/s]Running loglikelihood requests:   4%|▍         | 2305/56168 [00:13<04:00, 224.10it/s]Running loglikelihood requests:   4%|▍         | 2369/56168 [00:13<03:57, 226.43it/s]Running loglikelihood requests:   4%|▍         | 2433/56168 [00:14<03:55, 228.38it/s]Running loglikelihood requests:   4%|▍         | 2497/56168 [00:14<03:53, 229.57it/s]Running loglikelihood requests:   5%|▍         | 2561/56168 [00:14<03:53, 229.68it/s]Running loglikelihood requests:   5%|▍         | 2625/56168 [00:15<03:51, 231.29it/s]Running loglikelihood requests:   5%|▍         | 2689/56168 [00:15<03:48, 233.58it/s]Running loglikelihood requests:   5%|▍         | 2753/56168 [00:15<03:47, 234.42it/s]Running loglikelihood requests:   5%|▌         | 2817/56168 [00:15<03:46, 235.91it/s]Running loglikelihood requests:   5%|▌         | 2881/56168 [00:16<03:45, 236.51it/s]Running loglikelihood requests:   5%|▌         | 2945/56168 [00:16<03:43, 237.73it/s]Running loglikelihood requests:   5%|▌         | 3009/56168 [00:16<03:42, 238.55it/s]Running loglikelihood requests:   5%|▌         | 3073/56168 [00:16<03:40, 240.39it/s]Running loglikelihood requests:   6%|▌         | 3137/56168 [00:17<03:39, 241.82it/s]Running loglikelihood requests:   6%|▌         | 3201/56168 [00:17<03:37, 242.99it/s]Running loglikelihood requests:   6%|▌         | 3265/56168 [00:17<03:37, 243.76it/s]Running loglikelihood requests:   6%|▌         | 3329/56168 [00:17<03:36, 244.51it/s]Running loglikelihood requests:   6%|▌         | 3393/56168 [00:18<03:35, 245.23it/s]Running loglikelihood requests:   6%|▌         | 3457/56168 [00:18<03:34, 245.73it/s]Running loglikelihood requests:   6%|▋         | 3521/56168 [00:18<03:33, 246.30it/s]Running loglikelihood requests:   6%|▋         | 3585/56168 [00:19<03:33, 246.52it/s]Running loglikelihood requests:   6%|▋         | 3649/56168 [00:19<03:30, 249.09it/s]Running loglikelihood requests:   7%|▋         | 3713/56168 [00:19<03:28, 251.33it/s]Running loglikelihood requests:   7%|▋         | 3777/56168 [00:19<03:27, 253.04it/s]Running loglikelihood requests:   7%|▋         | 3841/56168 [00:19<03:25, 254.08it/s]Running loglikelihood requests:   7%|▋         | 3905/56168 [00:20<03:24, 255.29it/s]Running loglikelihood requests:   7%|▋         | 3969/56168 [00:20<03:23, 255.90it/s]Running loglikelihood requests:   7%|▋         | 4033/56168 [00:20<03:22, 256.88it/s]Running loglikelihood requests:   7%|▋         | 4097/56168 [00:20<03:19, 261.12it/s]Running loglikelihood requests:   7%|▋         | 4161/56168 [00:21<03:16, 265.16it/s]Running loglikelihood requests:   8%|▊         | 4225/56168 [00:21<03:13, 267.79it/s]Running loglikelihood requests:   8%|▊         | 4289/56168 [00:21<03:12, 269.68it/s]Running loglikelihood requests:   8%|▊         | 4353/56168 [00:21<03:10, 272.16it/s]Running loglikelihood requests:   8%|▊         | 4417/56168 [00:22<03:09, 273.57it/s]Running loglikelihood requests:   8%|▊         | 4481/56168 [00:22<03:08, 274.88it/s]Running loglikelihood requests:   8%|▊         | 4545/56168 [00:22<03:07, 275.62it/s]Running loglikelihood requests:   8%|▊         | 4609/56168 [00:22<03:06, 276.16it/s]Running loglikelihood requests:   8%|▊         | 4673/56168 [00:23<03:05, 277.72it/s]Running loglikelihood requests:   8%|▊         | 4737/56168 [00:23<03:04, 279.36it/s]Running loglikelihood requests:   9%|▊         | 4801/56168 [00:23<03:03, 280.24it/s]Running loglikelihood requests:   9%|▊         | 4865/56168 [00:23<03:02, 281.25it/s]Running loglikelihood requests:   9%|▉         | 4929/56168 [00:23<03:01, 282.10it/s]Running loglikelihood requests:   9%|▉         | 4993/56168 [00:24<03:01, 281.72it/s]Running loglikelihood requests:   9%|▉         | 5057/56168 [00:24<03:01, 282.14it/s]Running loglikelihood requests:   9%|▉         | 5121/56168 [00:24<02:59, 283.75it/s]Running loglikelihood requests:   9%|▉         | 5185/56168 [00:24<02:59, 284.69it/s]Running loglikelihood requests:   9%|▉         | 5249/56168 [00:25<02:57, 287.34it/s]Running loglikelihood requests:   9%|▉         | 5313/56168 [00:25<02:56, 288.86it/s]Running loglikelihood requests:  10%|▉         | 5377/56168 [00:25<02:54, 291.25it/s]Running loglikelihood requests:  10%|▉         | 5441/56168 [00:25<02:52, 293.59it/s]Running loglikelihood requests:  10%|▉         | 5505/56168 [00:25<02:51, 295.06it/s]Running loglikelihood requests:  10%|▉         | 5569/56168 [00:26<02:51, 295.88it/s]Running loglikelihood requests:  10%|█         | 5633/56168 [00:26<02:50, 296.84it/s]Running loglikelihood requests:  10%|█         | 5697/56168 [00:26<02:49, 297.28it/s]Running loglikelihood requests:  10%|█         | 5761/56168 [00:26<02:49, 297.85it/s]Running loglikelihood requests:  10%|█         | 5825/56168 [00:27<02:48, 298.81it/s]Running loglikelihood requests:  10%|█         | 5889/56168 [00:27<02:46, 302.12it/s]Running loglikelihood requests:  11%|█         | 5953/56168 [00:27<02:45, 304.29it/s]Running loglikelihood requests:  11%|█         | 6017/56168 [00:27<02:44, 305.50it/s]Running loglikelihood requests:  11%|█         | 6081/56168 [00:27<02:43, 307.23it/s]Running loglikelihood requests:  11%|█         | 6145/56168 [00:28<02:42, 308.55it/s]Running loglikelihood requests:  11%|█         | 6209/56168 [00:28<02:43, 306.35it/s]Running loglikelihood requests:  11%|█         | 6273/56168 [00:28<02:41, 308.87it/s]Running loglikelihood requests:  11%|█▏        | 6337/56168 [00:28<02:40, 310.66it/s]Running loglikelihood requests:  11%|█▏        | 6401/56168 [00:28<02:39, 311.50it/s]Running loglikelihood requests:  12%|█▏        | 6465/56168 [00:29<02:39, 312.29it/s]Running loglikelihood requests:  12%|█▏        | 6529/56168 [00:29<02:38, 313.66it/s]Running loglikelihood requests:  12%|█▏        | 6593/56168 [00:29<02:37, 315.02it/s]Running loglikelihood requests:  12%|█▏        | 6657/56168 [00:29<02:36, 315.88it/s]Running loglikelihood requests:  12%|█▏        | 6721/56168 [00:29<02:32, 323.83it/s]Running loglikelihood requests:  12%|█▏        | 6785/56168 [00:30<02:30, 328.42it/s]Running loglikelihood requests:  12%|█▏        | 6849/56168 [00:30<02:28, 331.72it/s]Running loglikelihood requests:  12%|█▏        | 6913/56168 [00:30<02:27, 334.33it/s]Running loglikelihood requests:  12%|█▏        | 6977/56168 [00:30<02:26, 335.32it/s]Running loglikelihood requests:  13%|█▎        | 7041/56168 [00:30<02:25, 337.54it/s]Running loglikelihood requests:  13%|█▎        | 7105/56168 [00:30<02:24, 338.73it/s]Running loglikelihood requests:  13%|█▎        | 7169/56168 [00:31<02:24, 339.93it/s]Running loglikelihood requests:  13%|█▎        | 7233/56168 [00:31<02:23, 341.61it/s]Running loglikelihood requests:  13%|█▎        | 7297/56168 [00:31<02:22, 342.35it/s]Running loglikelihood requests:  13%|█▎        | 7361/56168 [00:31<02:22, 343.01it/s]Running loglikelihood requests:  13%|█▎        | 7425/56168 [00:31<02:21, 344.28it/s]Running loglikelihood requests:  13%|█▎        | 7489/56168 [00:32<02:20, 345.46it/s]Running loglikelihood requests:  13%|█▎        | 7553/56168 [00:32<02:19, 348.02it/s]Running loglikelihood requests:  14%|█▎        | 7617/56168 [00:32<02:18, 350.25it/s]Running loglikelihood requests:  14%|█▎        | 7681/56168 [00:32<02:17, 352.31it/s]Running loglikelihood requests:  14%|█▍        | 7745/56168 [00:32<02:16, 354.28it/s]Running loglikelihood requests:  14%|█▍        | 7809/56168 [00:32<02:15, 356.08it/s]Running loglikelihood requests:  14%|█▍        | 7873/56168 [00:33<02:15, 357.69it/s]Running loglikelihood requests:  14%|█▍        | 7937/56168 [00:33<02:14, 358.91it/s]Running loglikelihood requests:  14%|█▍        | 8001/56168 [00:33<02:13, 359.72it/s]Running loglikelihood requests:  14%|█▍        | 8065/56168 [00:33<02:13, 360.44it/s]Running loglikelihood requests:  14%|█▍        | 8129/56168 [00:33<02:13, 360.89it/s]Running loglikelihood requests:  15%|█▍        | 8193/56168 [00:34<02:11, 363.94it/s]Running loglikelihood requests:  15%|█▍        | 8257/56168 [00:34<02:10, 367.19it/s]Running loglikelihood requests:  15%|█▍        | 8321/56168 [00:34<02:09, 368.52it/s]Running loglikelihood requests:  15%|█▍        | 8385/56168 [00:34<02:08, 370.95it/s]Running loglikelihood requests:  15%|█▌        | 8449/56168 [00:34<02:08, 372.00it/s]Running loglikelihood requests:  15%|█▌        | 8513/56168 [00:34<02:07, 373.46it/s]Running loglikelihood requests:  15%|█▌        | 8577/56168 [00:35<02:07, 374.18it/s]Running loglikelihood requests:  15%|█▌        | 8641/56168 [00:35<02:06, 375.41it/s]Running loglikelihood requests:  15%|█▌        | 8705/56168 [00:35<02:06, 375.84it/s]Running loglikelihood requests:  16%|█▌        | 8769/56168 [00:35<02:05, 378.20it/s]Running loglikelihood requests:  16%|█▌        | 8833/56168 [00:35<02:04, 378.96it/s]Running loglikelihood requests:  16%|█▌        | 8897/56168 [00:35<02:04, 380.68it/s]Running loglikelihood requests:  16%|█▌        | 8961/56168 [00:36<02:03, 381.59it/s]Running loglikelihood requests:  16%|█▌        | 9025/56168 [00:36<02:02, 384.85it/s]Running loglikelihood requests:  16%|█▌        | 9089/56168 [00:36<02:01, 388.13it/s]Running loglikelihood requests:  16%|█▋        | 9153/56168 [00:36<02:00, 391.06it/s]Running loglikelihood requests:  16%|█▋        | 9217/56168 [00:36<01:59, 394.06it/s]Running loglikelihood requests:  17%|█▋        | 9281/56168 [00:36<01:58, 396.43it/s]Running loglikelihood requests:  17%|█▋        | 9345/56168 [00:37<01:57, 398.24it/s]Running loglikelihood requests:  17%|█▋        | 9409/56168 [00:37<01:56, 400.10it/s]Running loglikelihood requests:  17%|█▋        | 9473/56168 [00:37<01:56, 401.15it/s]Running loglikelihood requests:  17%|█▋        | 9537/56168 [00:37<01:56, 401.96it/s]Running loglikelihood requests:  17%|█▋        | 9601/56168 [00:37<01:55, 402.61it/s]Running loglikelihood requests:  17%|█▋        | 9665/56168 [00:37<01:55, 404.16it/s]Running loglikelihood requests:  17%|█▋        | 9729/56168 [00:37<01:54, 406.12it/s]Running loglikelihood requests:  17%|█▋        | 9793/56168 [00:38<01:53, 408.03it/s]Running loglikelihood requests:  18%|█▊        | 9857/56168 [00:38<01:50, 417.88it/s]Running loglikelihood requests:  18%|█▊        | 9921/56168 [00:38<01:48, 424.88it/s]Running loglikelihood requests:  18%|█▊        | 9985/56168 [00:38<01:47, 430.87it/s]Running loglikelihood requests:  18%|█▊        | 10049/56168 [00:38<01:46, 433.66it/s]Running loglikelihood requests:  18%|█▊        | 10113/56168 [00:38<01:45, 436.69it/s]Running loglikelihood requests:  18%|█▊        | 10177/56168 [00:39<01:44, 439.13it/s]Running loglikelihood requests:  18%|█▊        | 10241/56168 [00:39<01:43, 441.95it/s]Running loglikelihood requests:  18%|█▊        | 10305/56168 [00:39<01:43, 442.74it/s]Running loglikelihood requests:  18%|█▊        | 10369/56168 [00:39<01:43, 442.15it/s]Running loglikelihood requests:  19%|█▊        | 10433/56168 [00:39<01:43, 442.86it/s]Running loglikelihood requests:  19%|█▊        | 10497/56168 [00:39<01:42, 443.64it/s]Running loglikelihood requests:  19%|█▉        | 10561/56168 [00:39<01:42, 445.49it/s]Running loglikelihood requests:  19%|█▉        | 10625/56168 [00:40<01:41, 446.51it/s]Running loglikelihood requests:  19%|█▉        | 10689/56168 [00:40<01:40, 450.46it/s]Running loglikelihood requests:  19%|█▉        | 10753/56168 [00:40<01:40, 452.07it/s]Running loglikelihood requests:  19%|█▉        | 10817/56168 [00:40<01:39, 457.77it/s]Running loglikelihood requests:  19%|█▉        | 10881/56168 [00:40<01:37, 462.47it/s]Running loglikelihood requests:  19%|█▉        | 10945/56168 [00:40<01:37, 464.88it/s]Running loglikelihood requests:  20%|█▉        | 11009/56168 [00:40<01:36, 465.88it/s]Running loglikelihood requests:  20%|█▉        | 11073/56168 [00:40<01:36, 467.75it/s]Running loglikelihood requests:  20%|█▉        | 11137/56168 [00:41<01:36, 467.46it/s]Running loglikelihood requests:  20%|█▉        | 11201/56168 [00:41<01:35, 469.75it/s]Running loglikelihood requests:  20%|██        | 11265/56168 [00:41<01:35, 470.90it/s]Running loglikelihood requests:  20%|██        | 11329/56168 [00:41<01:34, 472.05it/s]Running loglikelihood requests:  20%|██        | 11393/56168 [00:41<01:34, 473.01it/s]Running loglikelihood requests:  20%|██        | 11457/56168 [00:41<01:34, 473.98it/s]Running loglikelihood requests:  21%|██        | 11521/56168 [00:41<01:34, 474.18it/s]Running loglikelihood requests:  21%|██        | 11585/56168 [00:42<01:34, 473.78it/s]Running loglikelihood requests:  21%|██        | 11649/56168 [00:42<01:33, 474.26it/s]Running loglikelihood requests:  21%|██        | 11713/56168 [00:42<01:33, 476.23it/s]Running loglikelihood requests:  21%|██        | 11777/56168 [00:42<01:33, 476.85it/s]Running loglikelihood requests:  21%|██        | 11841/56168 [00:42<01:32, 477.72it/s]Running loglikelihood requests:  21%|██        | 11909/56168 [00:42<01:30, 487.06it/s]Running loglikelihood requests:  21%|██▏       | 11973/56168 [00:42<01:31, 485.07it/s]Running loglikelihood requests:  21%|██��       | 12037/56168 [00:42<01:30, 490.30it/s]Running loglikelihood requests:  22%|██▏       | 12101/56168 [00:43<01:29, 493.88it/s]Running loglikelihood requests:  22%|██▏       | 12165/56168 [00:43<01:28, 497.46it/s]Running loglikelihood requests:  22%|██▏       | 12229/56168 [00:43<01:27, 499.91it/s]Running loglikelihood requests:  22%|██▏       | 12293/56168 [00:43<01:27, 501.32it/s]Running loglikelihood requests:  22%|██▏       | 12357/56168 [00:43<01:26, 504.28it/s]Running loglikelihood requests:  22%|██▏       | 12421/56168 [00:43<01:26, 506.45it/s]Running loglikelihood requests:  22%|██▏       | 12485/56168 [00:43<01:26, 506.91it/s]Running loglikelihood requests:  22%|██▏       | 12549/56168 [00:43<01:25, 507.81it/s]Running loglikelihood requests:  22%|██▏       | 12613/56168 [00:44<01:25, 509.37it/s]Running loglikelihood requests:  23%|██▎       | 12677/56168 [00:44<01:25, 509.98it/s]Running loglikelihood requests:  23%|██▎       | 12741/56168 [00:44<01:25, 510.84it/s]Running loglikelihood requests:  23%|██▎       | 12805/56168 [00:44<01:24, 510.62it/s]Running loglikelihood requests:  23%|██▎       | 12869/56168 [00:44<01:24, 512.37it/s]Running loglikelihood requests:  23%|██▎       | 12933/56168 [00:44<01:24, 513.35it/s]Running loglikelihood requests:  23%|██▎       | 12997/56168 [00:44<01:24, 513.13it/s]Running loglikelihood requests:  23%|██▎       | 13061/56168 [00:44<01:24, 512.71it/s]Running loglikelihood requests:  23%|██▎       | 13125/56168 [00:45<01:23, 513.88it/s]Running loglikelihood requests:  23%|██▎       | 13189/56168 [00:45<01:23, 514.65it/s]Running loglikelihood requests:  24%|██▎       | 13253/56168 [00:45<01:23, 514.41it/s]Running loglikelihood requests:  24%|██▎       | 13317/56168 [00:45<01:23, 514.12it/s]Running loglikelihood requests:  24%|██▍       | 13381/56168 [00:45<01:22, 516.30it/s]Running loglikelihood requests:  24%|██▍       | 13445/56168 [00:45<01:22, 517.73it/s]Running loglikelihood requests:  24%|██▍       | 13509/56168 [00:45<01:22, 518.59it/s]Running loglikelihood requests:  24%|██▍       | 13573/56168 [00:45<01:21, 519.82it/s]Running loglikelihood requests:  24%|██▍       | 13637/56168 [00:46<01:21, 520.31it/s]Running loglikelihood requests:  24%|██▍       | 13701/56168 [00:46<01:21, 521.59it/s]Running loglikelihood requests:  25%|██▍       | 13765/56168 [00:46<01:21, 522.62it/s]Running loglikelihood requests:  25%|██▍       | 13829/56168 [00:46<01:21, 522.62it/s]Running loglikelihood requests:  25%|██▍       | 13893/56168 [00:46<01:20, 523.36it/s]Running loglikelihood requests:  25%|██▍       | 13957/56168 [00:46<01:20, 524.81it/s]Running loglikelihood requests:  25%|██▍       | 14021/56168 [00:46<01:18, 535.11it/s]Running loglikelihood requests:  25%|██▌       | 14085/56168 [00:46<01:17, 543.72it/s]Running loglikelihood requests:  25%|██▌       | 14149/56168 [00:47<01:16, 548.69it/s]Running loglikelihood requests:  25%|██▌       | 14213/56168 [00:47<01:15, 553.24it/s]Running loglikelihood requests:  25%|██▌       | 14277/56168 [00:47<01:15, 554.69it/s]Running loglikelihood requests:  26%|██▌       | 14341/56168 [00:47<01:15, 555.34it/s]Running loglikelihood requests:  26%|██▌       | 14405/56168 [00:47<01:14, 558.36it/s]Running loglikelihood requests:  26%|██▌       | 14469/56168 [00:47<01:14, 561.01it/s]Running loglikelihood requests:  26%|██▌       | 14533/56168 [00:47<01:14, 560.88it/s]Running loglikelihood requests:  26%|██▌       | 14597/56168 [00:47<01:13, 562.33it/s]Running loglikelihood requests:  26%|██▌       | 14661/56168 [00:47<01:13, 563.45it/s]Running loglikelihood requests:  26%|██▌       | 14725/56168 [00:48<01:13, 563.84it/s]Running loglikelihood requests:  26%|██▋       | 14789/56168 [00:48<01:13, 564.70it/s]Running loglikelihood requests:  26%|██▋       | 14853/56168 [00:48<01:13, 565.71it/s]Running loglikelihood requests:  27%|██▋       | 14917/56168 [00:48<01:12, 566.31it/s]Running loglikelihood requests:  27%|██▋       | 14981/56168 [00:48<01:12, 566.67it/s]Running loglikelihood requests:  27%|██▋       | 15045/56168 [00:48<01:12, 566.69it/s]Running loglikelihood requests:  27%|██▋       | 15109/56168 [00:48<01:12, 564.55it/s]Running loglikelihood requests:  27%|██▋       | 15173/56168 [00:48<01:12, 567.34it/s]Running loglikelihood requests:  27%|██▋       | 15237/56168 [00:48<01:12, 566.77it/s]Running loglikelihood requests:  27%|██▋       | 15301/56168 [00:49<01:12, 566.24it/s]Running loglikelihood requests:  27%|██▋       | 15365/56168 [00:49<01:11, 568.40it/s]Running loglikelihood requests:  27%|██▋       | 15429/56168 [00:49<01:11, 567.08it/s]Running loglikelihood requests:  28%|██▊       | 15493/56168 [00:49<01:11, 567.89it/s]Running loglikelihood requests:  28%|██▊       | 15557/56168 [00:49<01:11, 569.00it/s]Running loglikelihood requests:  28%|██▊       | 15621/56168 [00:49<01:11, 567.93it/s]Running loglikelihood requests:  28%|██▊       | 15685/56168 [00:49<01:11, 568.91it/s]Running loglikelihood requests:  28%|██▊       | 15749/56168 [00:49<01:10, 569.49it/s]Running loglikelihood requests:  28%|██▊       | 15813/56168 [00:49<01:10, 569.58it/s]Running loglikelihood requests:  28%|██▊       | 15877/56168 [00:50<01:10, 570.61it/s]Running loglikelihood requests:  28%|██▊       | 15941/56168 [00:50<01:10, 569.72it/s]Running loglikelihood requests:  28%|██▊       | 16005/56168 [00:50<01:10, 570.14it/s]Running loglikelihood requests:  29%|██▊       | 16069/56168 [00:50<01:10, 569.00it/s]Running loglikelihood requests:  29%|██▊       | 16133/56168 [00:50<01:10, 567.09it/s]Running loglikelihood requests:  29%|██▉       | 16197/56168 [00:50<01:10, 567.70it/s]Running loglikelihood requests:  29%|██▉       | 16261/56168 [00:50<01:10, 567.06it/s]Running loglikelihood requests:  29%|██▉       | 16325/56168 [00:50<01:10, 566.18it/s]Running loglikelihood requests:  29%|██▉       | 16389/56168 [00:51<01:10, 568.22it/s]Running loglikelihood requests:  29%|██▉       | 16453/56168 [00:51<01:09, 568.73it/s]Running loglikelihood requests:  29%|██▉       | 16517/56168 [00:51<01:09, 568.30it/s]Running loglikelihood requests:  30%|██▉       | 16581/56168 [00:51<01:09, 568.65it/s]Running loglikelihood requests:  30%|██▉       | 16645/56168 [00:51<01:09, 568.86it/s]Running loglikelihood requests:  30%|██▉       | 16709/56168 [00:51<01:09, 569.88it/s]Running loglikelihood requests:  30%|██▉       | 16773/56168 [00:51<01:09, 569.79it/s]Running loglikelihood requests:  30%|██▉       | 16837/56168 [00:51<01:09, 568.06it/s]Running loglikelihood requests:  30%|███       | 16901/56168 [00:51<01:09, 569.01it/s]Running loglikelihood requests:  30%|███       | 16965/56168 [00:52<01:08, 568.73it/s]Running loglikelihood requests:  30%|███       | 17029/56168 [00:52<01:08, 569.07it/s]Running loglikelihood requests:  30%|███       | 17093/56168 [00:52<01:08, 569.72it/s]Running loglikelihood requests:  31%|███       | 17157/56168 [00:52<01:08, 569.27it/s]Running loglikelihood requests:  31%|███       | 17221/56168 [00:52<01:08, 570.19it/s]Running loglikelihood requests:  31%|███       | 17285/56168 [00:52<01:08, 570.32it/s]Running loglikelihood requests:  31%|███       | 17349/56168 [00:52<01:08, 569.54it/s]Running loglikelihood requests:  31%|███       | 17413/56168 [00:52<01:07, 570.74it/s]Running loglikelihood requests:  31%|███       | 17477/56168 [00:52<01:07, 570.90it/s]Running loglikelihood requests:  31%|███       | 17541/56168 [00:53<01:07, 571.46it/s]Running loglikelihood requests:  31%|███▏      | 17605/56168 [00:53<01:07, 571.23it/s]Running loglikelihood requests:  31%|███▏      | 17669/56168 [00:53<01:07, 570.47it/s]Running loglikelihood requests:  32%|███▏      | 17733/56168 [00:53<01:07, 571.50it/s]Running loglikelihood requests:  32%|███▏      | 17797/56168 [00:53<01:07, 571.23it/s]Running loglikelihood requests:  32%|███▏      | 17861/56168 [00:53<01:07, 571.24it/s]Running loglikelihood requests:  32%|███▏      | 17925/56168 [00:53<01:06, 571.91it/s]Running loglikelihood requests:  32%|███▏      | 17989/56168 [00:53<01:06, 572.21it/s]Running loglikelihood requests:  32%|███▏      | 18053/56168 [00:53<01:06, 573.09it/s]Running loglikelihood requests:  32%|███▏      | 18117/56168 [00:54<01:06, 572.33it/s]Running loglikelihood requests:  32%|███▏      | 18181/56168 [00:54<01:06, 572.68it/s]Running loglikelihood requests:  32%|███▏      | 18245/56168 [00:54<01:06, 572.05it/s]Running loglikelihood requests:  33%|███▎      | 18309/56168 [00:54<01:06, 571.79it/s]Running loglikelihood requests:  33%|███▎      | 18373/56168 [00:54<01:06, 571.43it/s]Running loglikelihood requests:  33%|███▎      | 18437/56168 [00:54<01:06, 571.12it/s]Running loglikelihood requests:  33%|███▎      | 18501/56168 [00:54<01:05, 572.29it/s]Running loglikelihood requests:  33%|███▎      | 18565/56168 [00:54<01:05, 572.12it/s]Running loglikelihood requests:  33%|███▎      | 18629/56168 [00:54<01:05, 573.06it/s]Running loglikelihood requests:  33%|███▎      | 18697/56168 [00:55<01:04, 585.09it/s]Running loglikelihood requests:  33%|███▎      | 18761/56168 [00:55<01:04, 581.17it/s]Running loglikelihood requests:  34%|███▎      | 18825/56168 [00:55<01:04, 579.75it/s]Running loglikelihood requests:  34%|███▎      | 18889/56168 [00:55<01:04, 577.21it/s]Running loglikelihood requests:  34%|███▍      | 18967/56168 [00:55<00:58, 633.48it/s]Running loglikelihood requests:  34%|███▍      | 19043/56168 [00:55<00:55, 669.33it/s]Running loglikelihood requests:  34%|███▍      | 19128/56168 [00:55<00:51, 721.43it/s]Running loglikelihood requests:  34%|███▍      | 19209/56168 [00:55<01:03, 586.31it/s]Running loglikelihood requests:  34%|███▍      | 19293/56168 [00:55<00:56, 648.82it/s]Running loglikelihood requests:  35%|███▍      | 19382/56168 [00:56<00:51, 711.51it/s]Running loglikelihood requests:  35%|███▍      | 19465/56168 [00:56<01:01, 593.81it/s]Running loglikelihood requests:  35%|███▍      | 19551/56168 [00:56<00:55, 656.67it/s]Running loglikelihood requests:  35%|███▍      | 19623/56168 [00:56<00:54, 672.52it/s]Running loglikelihood requests:  35%|███▌      | 19698/56168 [00:56<00:52, 693.04it/s]Running loglikelihood requests:  35%|███▌      | 19775/56168 [00:56<00:50, 714.13it/s]Running loglikelihood requests:  35%|███▌      | 19853/56168 [00:56<01:02, 580.62it/s]Running loglikelihood requests:  35%|███▌      | 19938/56168 [00:56<00:56, 645.71it/s]Running loglikelihood requests:  36%|███▌      | 20040/56168 [00:57<00:48, 741.75it/s]Running loglikelihood requests:  36%|███▌      | 20120/56168 [00:57<00:59, 608.55it/s]Running loglikelihood requests:  36%|███▌      | 20215/56168 [00:57<00:52, 688.94it/s]Running loglikelihood requests:  36%|███▌      | 20305/56168 [00:57<00:59, 601.15it/s]Running loglikelihood requests:  36%|███▋      | 20386/56168 [00:57<00:55, 648.26it/s]Running loglikelihood requests:  36%|███▋      | 20477/56168 [00:57<00:50, 712.00it/s]Running loglikelihood requests:  37%|███▋      | 20561/56168 [00:57<00:59, 600.84it/s]Running loglikelihood requests:  37%|███▋      | 20652/56168 [00:58<00:52, 671.71it/s]Running loglikelihood requests:  37%|███▋      | 20750/56168 [00:58<00:47, 747.70it/s]Running loglikelihood requests:  37%|███▋      | 20832/56168 [00:58<00:57, 618.11it/s]Running loglikelihood requests:  37%|███▋      | 20931/56168 [00:58<00:50, 703.97it/s]Running loglikelihood requests:  37%|███▋      | 21013/56168 [00:58<00:59, 595.77it/s]Running loglikelihood requests:  38%|███▊      | 21099/56168 [00:58<00:53, 654.79it/s]Running loglikelihood requests:  38%|███▊      | 21195/56168 [00:58<00:47, 728.67it/s]Running loglikelihood requests:  38%|███▊      | 21276/56168 [00:59<00:57, 607.36it/s]Running loglikelihood requests:  38%|███▊      | 21376/56168 [00:59<00:49, 697.36it/s]Running loglikelihood requests:  38%|███▊      | 21469/56168 [00:59<00:56, 616.46it/s]Running loglikelihood requests:  38%|███▊      | 21566/56168 [00:59<00:49, 695.40it/s]Running loglikelihood requests:  39%|███▊      | 21661/56168 [00:59<00:55, 621.42it/s]Running loglikelihood requests:  39%|███▉      | 21772/56168 [00:59<00:47, 729.41it/s]Running loglikelihood requests:  39%|███▉      | 21857/56168 [00:59<00:55, 622.22it/s]Running loglikelihood requests:  39%|███▉      | 21961/56168 [00:59<00:47, 713.81it/s]Running loglikelihood requests:  39%|███▉      | 22049/56168 [01:00<00:54, 620.38it/s]Running loglikelihood requests:  39%|███▉      | 22165/56168 [01:00<00:46, 738.62it/s]Running loglikelihood requests:  40%|███▉      | 22250/56168 [01:00<00:53, 630.78it/s]Running loglikelihood requests:  40%|███▉      | 22361/56168 [01:00<00:45, 735.76it/s]Running loglikelihood requests:  40%|███▉      | 22445/56168 [01:00<00:53, 629.38it/s]Running loglikelihood requests:  40%|████      | 22561/56168 [01:00<00:54, 622.07it/s]Running loglikelihood requests:  40%|████      | 22681/56168 [01:01<00:45, 741.97it/s]Running loglikelihood requests:  41%|████      | 22766/56168 [01:01<00:52, 638.74it/s]Running loglikelihood requests:  41%|████      | 22881/56168 [01:01<00:53, 626.11it/s]Running loglikelihood requests:  41%|████      | 23009/56168 [01:01<00:51, 641.57it/s]Running loglikelihood requests:  41%|████      | 23137/56168 [01:01<00:50, 651.31it/s]Running loglikelihood requests:  41%|████▏     | 23262/56168 [01:01<00:42, 766.84it/s]Running loglikelihood requests:  42%|████▏     | 23348/56168 [01:02<00:49, 663.00it/s]Running loglikelihood requests:  42%|████▏     | 23457/56168 [01:02<00:51, 634.07it/s]Running loglikelihood requests:  42%|████▏     | 23585/56168 [01:02<00:50, 650.50it/s]Running loglikelihood requests:  42%|████▏     | 23713/56168 [01:02<00:48, 664.96it/s]Running loglikelihood requests:  42%|████▏     | 23841/56168 [01:02<00:47, 676.36it/s]Running loglikelihood requests:  43%|████▎     | 23969/56168 [01:03<00:47, 684.46it/s]Running loglikelihood requests:  43%|████▎     | 24097/56168 [01:03<00:46, 689.40it/s]Running loglikelihood requests:  43%|████▎     | 24225/56168 [01:03<00:46, 693.95it/s]Running loglikelihood requests:  43%|████▎     | 24353/56168 [01:03<00:45, 696.78it/s]Running loglikelihood requests:  44%|████▎     | 24481/56168 [01:03<00:45, 699.35it/s]Running loglikelihood requests:  44%|████▍     | 24609/56168 [01:03<00:44, 702.46it/s]Running loglikelihood requests:  44%|████▍     | 24737/56168 [01:04<00:44, 704.30it/s]Running loglikelihood requests:  44%|████▍     | 24865/56168 [01:04<00:44, 706.57it/s]Running loglikelihood requests:  44%|████▍     | 24993/56168 [01:04<00:44, 707.46it/s]Running loglikelihood requests:  45%|████▍     | 25125/56168 [01:04<00:43, 715.49it/s]Running loglikelihood requests:  45%|████▍     | 25253/56168 [01:04<00:43, 715.16it/s]Running loglikelihood requests:  45%|████▌     | 25381/56168 [01:04<00:43, 714.75it/s]Running loglikelihood requests:  45%|████▌     | 25509/56168 [01:05<00:42, 715.00it/s]Running loglikelihood requests:  46%|████▌     | 25637/56168 [01:05<00:42, 715.50it/s]Running loglikelihood requests:  46%|████▌     | 25765/56168 [01:05<00:42, 717.99it/s]Running loglikelihood requests:  46%|████▌     | 25893/56168 [01:05<00:42, 720.01it/s]Running loglikelihood requests:  46%|████▋     | 26021/56168 [01:05<00:41, 720.99it/s]Running loglikelihood requests:  47%|████▋     | 26149/56168 [01:06<00:41, 720.77it/s]Running loglikelihood requests:  47%|████▋     | 26277/56168 [01:06<00:41, 719.89it/s]Running loglikelihood requests:  47%|████▋     | 26405/56168 [01:06<00:41, 719.18it/s]Running loglikelihood requests:  47%|████▋     | 26533/56168 [01:06<00:41, 719.47it/s]Running loglikelihood requests:  47%|████▋     | 26661/56168 [01:06<00:40, 719.74it/s]Running loglikelihood requests:  48%|████▊     | 26789/56168 [01:06<00:40, 719.75it/s]Running loglikelihood requests:  48%|████▊     | 26917/56168 [01:07<00:40, 720.48it/s]Running loglikelihood requests:  48%|████▊     | 27045/56168 [01:07<00:40, 721.68it/s]Running loglikelihood requests:  48%|████▊     | 27173/56168 [01:07<00:40, 722.43it/s]Running loglikelihood requests:  49%|████▊     | 27301/56168 [01:07<00:39, 722.81it/s]Running loglikelihood requests:  49%|████▉     | 27429/56168 [01:07<00:39, 721.90it/s]Running loglikelihood requests:  49%|████▉     | 27557/56168 [01:08<00:39, 723.27it/s]Running loglikelihood requests:  49%|████▉     | 27685/56168 [01:08<00:39, 724.80it/s]Running loglikelihood requests:  50%|████▉     | 27813/56168 [01:08<00:39, 726.03it/s]Running loglikelihood requests:  50%|████▉     | 27941/56168 [01:08<00:38, 726.67it/s]Running loglikelihood requests:  50%|████▉     | 28069/56168 [01:08<00:38, 726.60it/s]Running loglikelihood requests:  50%|█████     | 28197/56168 [01:08<00:38, 728.15it/s]Running loglikelihood requests:  50%|█████     | 28329/56168 [01:09<00:37, 736.07it/s]Running loglikelihood requests:  51%|█████     | 28457/56168 [01:09<00:37, 734.52it/s]Running loglikelihood requests:  51%|█████     | 28585/56168 [01:09<00:37, 732.34it/s]Running loglikelihood requests:  51%|█████     | 28713/56168 [01:09<00:37, 731.45it/s]Running loglikelihood requests:  51%|█████▏    | 28841/56168 [01:09<00:37, 731.69it/s]Running loglikelihood requests:  52%|█████▏    | 28969/56168 [01:09<00:37, 732.33it/s]Running loglikelihood requests:  52%|█████▏    | 29097/56168 [01:10<00:36, 731.81it/s]Running loglikelihood requests:  52%|█████▏    | 29225/56168 [01:10<00:36, 738.39it/s]Running loglikelihood requests:  52%|█████▏    | 29353/56168 [01:10<00:35, 750.00it/s]Running loglikelihood requests:  52%|█████▏    | 29485/56168 [01:10<00:34, 767.27it/s]Running loglikelihood requests:  53%|█████▎    | 29613/56168 [01:10<00:34, 772.81it/s]Running loglikelihood requests:  53%|█████▎    | 29741/56168 [01:10<00:34, 775.62it/s]Running loglikelihood requests:  53%|█████▎    | 29869/56168 [01:11<00:33, 776.97it/s]Running loglikelihood requests:  53%|█████▎    | 29997/56168 [01:11<00:33, 779.36it/s]Running loglikelihood requests:  54%|█████▎    | 30125/56168 [01:11<00:33, 781.83it/s]Running loglikelihood requests:  54%|█████▍    | 30253/56168 [01:11<00:33, 784.35it/s]Running loglikelihood requests:  54%|█████▍    | 30381/56168 [01:11<00:32, 785.78it/s]Running loglikelihood requests:  54%|█████▍    | 30509/56168 [01:11<00:32, 787.55it/s]Running loglikelihood requests:  55%|█████▍    | 30637/56168 [01:12<00:32, 789.07it/s]Running loglikelihood requests:  55%|█████▍    | 30765/56168 [01:12<00:32, 788.35it/s]Running loglikelihood requests:  55%|█████▌    | 30897/56168 [01:12<00:31, 795.14it/s]Running loglikelihood requests:  55%|█████▌    | 31025/56168 [01:12<00:31, 794.43it/s]Running loglikelihood requests:  55%|█████▌    | 31153/56168 [01:12<00:31, 793.93it/s]Running loglikelihood requests:  56%|█████▌    | 31281/56168 [01:12<00:31, 793.32it/s]Running loglikelihood requests:  56%|█████▌    | 31409/56168 [01:13<00:31, 792.94it/s]Running loglikelihood requests:  56%|█████▌    | 31537/56168 [01:13<00:31, 793.87it/s]Running loglikelihood requests:  56%|█████▋    | 31665/56168 [01:13<00:30, 794.94it/s]Running loglikelihood requests:  57%|█████▋    | 31793/56168 [01:13<00:30, 794.66it/s]Running loglikelihood requests:  57%|█████▋    | 31925/56168 [01:13<00:30, 802.98it/s]Running loglikelihood requests:  57%|█████▋    | 32053/56168 [01:13<00:30, 801.48it/s]Running loglikelihood requests:  57%|█████▋    | 32181/56168 [01:14<00:29, 801.07it/s]Running loglikelihood requests:  58%|█████▊    | 32309/56168 [01:14<00:29, 801.14it/s]Running loglikelihood requests:  58%|█████▊    | 32437/56168 [01:14<00:29, 800.83it/s]Running loglikelihood requests:  58%|█████▊    | 32569/56168 [01:14<00:29, 809.13it/s]Running loglikelihood requests:  58%|█████▊    | 32697/56168 [01:14<00:29, 806.39it/s]Running loglikelihood requests:  58%|█████▊    | 32829/56168 [01:14<00:28, 812.76it/s]Running loglikelihood requests:  59%|█████▊    | 32957/56168 [01:14<00:28, 809.46it/s]Running loglikelihood requests:  59%|█████▉    | 33085/56168 [01:15<00:28, 807.25it/s]Running loglikelihood requests:  59%|█████▉    | 33213/56168 [01:15<00:28, 805.72it/s]Running loglikelihood requests:  59%|█████▉    | 33341/56168 [01:15<00:28, 804.21it/s]Running loglikelihood requests:  60%|█████▉    | 33469/56168 [01:15<00:28, 804.65it/s]Running loglikelihood requests:  60%|█████▉    | 33597/56168 [01:15<00:28, 804.60it/s]Running loglikelihood requests:  60%|██████    | 33725/56168 [01:15<00:27, 804.21it/s]Running loglikelihood requests:  60%|██████    | 33853/56168 [01:16<00:27, 802.84it/s]Running loglikelihood requests:  60%|██████    | 33981/56168 [01:16<00:27, 802.74it/s]Running loglikelihood requests:  61%|██████    | 34109/56168 [01:16<00:27, 804.18it/s]Running loglikelihood requests:  61%|██████    | 34237/56168 [01:16<00:27, 804.33it/s]Running loglikelihood requests:  61%|██████    | 34365/56168 [01:16<00:27, 804.48it/s]Running loglikelihood requests:  61%|██████▏   | 34493/56168 [01:16<00:26, 804.20it/s]Running loglikelihood requests:  62%|██████▏   | 34621/56168 [01:17<00:26, 804.48it/s]Running loglikelihood requests:  62%|██████▏   | 34749/56168 [01:17<00:26, 804.94it/s]Running loglikelihood requests:  62%|██████▏   | 34877/56168 [01:17<00:26, 804.76it/s]Running loglikelihood requests:  62%|██████▏   | 35005/56168 [01:17<00:26, 804.04it/s]Running loglikelihood requests:  63%|██████▎   | 35133/56168 [01:17<00:26, 805.45it/s]Running loglikelihood requests:  63%|██████▎   | 35261/56168 [01:17<00:25, 807.67it/s]Running loglikelihood requests:  63%|██████▎   | 35389/56168 [01:17<00:25, 810.16it/s]Running loglikelihood requests:  63%|██████▎   | 35517/56168 [01:18<00:25, 811.00it/s]Running loglikelihood requests:  63%|██████▎   | 35645/56168 [01:18<00:25, 812.93it/s]Running loglikelihood requests:  64%|██████▎   | 35773/56168 [01:18<00:25, 813.49it/s]Running loglikelihood requests:  64%|██████▍   | 35901/56168 [01:18<00:24, 813.20it/s]Running loglikelihood requests:  64%|██████▍   | 36029/56168 [01:18<00:24, 812.13it/s]Running loglikelihood requests:  64%|██████▍   | 36157/56168 [01:18<00:24, 812.12it/s]Running loglikelihood requests:  65%|██████▍   | 36285/56168 [01:19<00:24, 815.69it/s]Running loglikelihood requests:  65%|██████▍   | 36413/56168 [01:19<00:24, 817.60it/s]Running loglikelihood requests:  65%|██████▌   | 36541/56168 [01:19<00:23, 819.31it/s]Running loglikelihood requests:  65%|██████▌   | 36669/56168 [01:19<00:23, 818.87it/s]Running loglikelihood requests:  66%|██████▌   | 36797/56168 [01:19<00:23, 819.40it/s]Running loglikelihood requests:  66%|██████▌   | 36925/56168 [01:19<00:23, 821.46it/s]Running loglikelihood requests:  66%|██████▌   | 37057/56168 [01:20<00:23, 829.10it/s]Running loglikelihood requests:  66%|██████▌   | 37185/56168 [01:20<00:22, 826.33it/s]Running loglikelihood requests:  66%|██████▋   | 37313/56168 [01:20<00:22, 823.79it/s]Running loglikelihood requests:  67%|██████▋   | 37441/56168 [01:20<00:22, 834.81it/s]Running loglikelihood requests:  67%|██████▋   | 37569/56168 [01:20<00:21, 852.74it/s]Running loglikelihood requests:  67%|██████▋   | 37697/56168 [01:20<00:21, 866.22it/s]Running loglikelihood requests:  67%|██████▋   | 37825/56168 [01:20<00:20, 875.98it/s]Running loglikelihood requests:  68%|██████▊   | 37953/56168 [01:21<00:20, 884.03it/s]Running loglikelihood requests:  68%|██████▊   | 38081/56168 [01:21<00:20, 889.73it/s]Running loglikelihood requests:  68%|██████▊   | 38209/56168 [01:21<00:20, 893.42it/s]Running loglikelihood requests:  68%|██████▊   | 38337/56168 [01:21<00:19, 895.84it/s]Running loglikelihood requests:  68%|██████▊   | 38465/56168 [01:21<00:19, 897.99it/s]Running loglikelihood requests:  69%|██████▊   | 38593/56168 [01:21<00:19, 898.26it/s]Running loglikelihood requests:  69%|██████▉   | 38721/56168 [01:21<00:19, 901.51it/s]Running loglikelihood requests:  69%|██████▉   | 38853/56168 [01:22<00:19, 911.19it/s]Running loglikelihood requests:  69%|██████▉   | 38981/56168 [01:22<00:18, 911.38it/s]Running loglikelihood requests:  70%|██████▉   | 39109/56168 [01:22<00:18, 909.63it/s]Running loglikelihood requests:  70%|██████▉   | 39237/56168 [01:22<00:18, 910.66it/s]Running loglikelihood requests:  70%|███████   | 39365/56168 [01:22<00:18, 910.98it/s]Running loglikelihood requests:  70%|███████   | 39493/56168 [01:22<00:18, 911.43it/s]Running loglikelihood requests:  71%|███████   | 39621/56168 [01:22<00:18, 912.88it/s]Running loglikelihood requests:  71%|███████   | 39749/56168 [01:23<00:17, 913.20it/s]Running loglikelihood requests:  71%|███████   | 39877/56168 [01:23<00:17, 914.40it/s]Running loglikelihood requests:  71%|███████   | 40005/56168 [01:23<00:17, 915.96it/s]Running loglikelihood requests:  71%|███████▏  | 40133/56168 [01:23<00:17, 916.98it/s]Running loglikelihood requests:  72%|███████▏  | 40261/56168 [01:23<00:17, 918.50it/s]Running loglikelihood requests:  72%|███████▏  | 40389/56168 [01:23<00:17, 919.36it/s]Running loglikelihood requests:  72%|███████▏  | 40517/56168 [01:23<00:16, 921.83it/s]Running loglikelihood requests:  72%|███████▏  | 40645/56168 [01:24<00:16, 923.59it/s]Running loglikelihood requests:  73%|███████▎  | 40773/56168 [01:24<00:16, 923.34it/s]Running loglikelihood requests:  73%|███████▎  | 40901/56168 [01:24<00:16, 923.74it/s]Running loglikelihood requests:  73%|███████▎  | 41029/56168 [01:24<00:16, 924.79it/s]Running loglikelihood requests:  73%|███████▎  | 41157/56168 [01:24<00:16, 926.98it/s]Running loglikelihood requests:  74%|███████▎  | 41285/56168 [01:24<00:16, 929.53it/s]Running loglikelihood requests:  74%|███████▎  | 41413/56168 [01:24<00:15, 929.00it/s]Running loglikelihood requests:  74%|███████▍  | 41545/56168 [01:24<00:15, 938.80it/s]Running loglikelihood requests:  74%|███████▍  | 41673/56168 [01:25<00:15, 937.70it/s]Running loglikelihood requests:  74%|███████▍  | 41801/56168 [01:25<00:15, 936.28it/s]Running loglikelihood requests:  75%|███████▍  | 41929/56168 [01:25<00:15, 936.11it/s]Running loglikelihood requests:  75%|███████▍  | 42065/56168 [01:25<00:14, 952.95it/s]Running loglikelihood requests:  75%|███████▌  | 42193/56168 [01:25<00:14, 948.26it/s]Running loglikelihood requests:  75%|███████▌  | 42321/56168 [01:25<00:14, 946.00it/s]Running loglikelihood requests:  76%|███████▌  | 42449/56168 [01:25<00:14, 944.07it/s]Running loglikelihood requests:  76%|███████▌  | 42577/56168 [01:26<00:14, 942.21it/s]Running loglikelihood requests:  76%|███████▌  | 42709/56168 [01:26<00:14, 950.16it/s]Running loglikelihood requests:  76%|███████▋  | 42837/56168 [01:26<00:14, 947.14it/s]Running loglikelihood requests:  76%|███████▋  | 42965/56168 [01:26<00:13, 945.79it/s]Running loglikelihood requests:  77%|███████▋  | 43093/56168 [01:26<00:13, 945.12it/s]Running loglikelihood requests:  77%|███████▋  | 43221/56168 [01:26<00:13, 944.76it/s]Running loglikelihood requests:  77%|███████▋  | 43349/56168 [01:26<00:13, 945.43it/s]Running loglikelihood requests:  77%|███████▋  | 43477/56168 [01:27<00:13, 944.66it/s]Running loglikelihood requests:  78%|███████▊  | 43605/56168 [01:27<00:13, 944.54it/s]Running loglikelihood requests:  78%|███████▊  | 43733/56168 [01:27<00:13, 945.51it/s]Running loglikelihood requests:  78%|███████▊  | 43861/56168 [01:27<00:13, 946.17it/s]Running loglikelihood requests:  78%|███████▊  | 43989/56168 [01:27<00:12, 946.59it/s]Running loglikelihood requests:  79%|███��███▊  | 44121/56168 [01:27<00:12, 958.40it/s]Running loglikelihood requests:  79%|███████▉  | 44249/56168 [01:27<00:12, 958.78it/s]Running loglikelihood requests:  79%|███████▉  | 44377/56168 [01:27<00:12, 957.10it/s]Running loglikelihood requests:  79%|███████▉  | 44505/56168 [01:28<00:12, 955.90it/s]Running loglikelihood requests:  79%|███████▉  | 44633/56168 [01:28<00:12, 957.87it/s]Running loglikelihood requests:  80%|███████▉  | 44761/56168 [01:28<00:11, 957.35it/s]Running loglikelihood requests:  80%|███████▉  | 44889/56168 [01:28<00:11, 959.97it/s]Running loglikelihood requests:  80%|████████  | 45017/56168 [01:28<00:11, 961.40it/s]Running loglikelihood requests:  80%|████████  | 45145/56168 [01:28<00:11, 959.55it/s]Running loglikelihood requests:  81%|████████  | 45273/56168 [01:28<00:11, 959.51it/s]Running loglikelihood requests:  81%|████████  | 45401/56168 [01:29<00:11, 962.32it/s]Running loglikelihood requests:  81%|████████  | 45529/56168 [01:29<00:11, 962.62it/s]Running loglikelihood requests:  81%|████████▏ | 45657/56168 [01:29<00:10, 963.62it/s]Running loglikelihood requests:  82%|████████▏ | 45785/56168 [01:29<00:10, 962.91it/s]Running loglikelihood requests:  82%|████████▏ | 45913/56168 [01:29<00:10, 963.03it/s]Running loglikelihood requests:  82%|████████▏ | 46041/56168 [01:29<00:10, 964.20it/s]Running loglikelihood requests:  82%|████████▏ | 46169/56168 [01:29<00:10, 963.66it/s]Running loglikelihood requests:  82%|████████▏ | 46297/56168 [01:29<00:10, 962.80it/s]Running loglikelihood requests:  83%|████████▎ | 46425/56168 [01:30<00:10, 964.13it/s]Running loglikelihood requests:  83%|████████▎ | 46553/56168 [01:30<00:09, 962.53it/s]Running loglikelihood requests:  83%|████████▎ | 46681/56168 [01:30<00:09, 962.55it/s]Running loglikelihood requests:  83%|████████▎ | 46809/56168 [01:30<00:09, 965.11it/s]Running loglikelihood requests:  84%|████████▎ | 46937/56168 [01:30<00:09, 964.52it/s]Running loglikelihood requests:  84%|████████▍ | 47065/56168 [01:30<00:09, 965.56it/s]Running loglikelihood requests:  84%|████████▍ | 47193/56168 [01:30<00:09, 966.97it/s]Running loglikelihood requests:  84%|████████▍ | 47321/56168 [01:31<00:09, 967.13it/s]Running loglikelihood requests:  84%|████████▍ | 47449/56168 [01:31<00:09, 967.36it/s]Running loglikelihood requests:  85%|████████▍ | 47577/56168 [01:31<00:08, 967.28it/s]Running loglikelihood requests:  85%|████████▍ | 47705/56168 [01:31<00:08, 967.68it/s]Running loglikelihood requests:  85%|████████▌ | 47833/56168 [01:31<00:08, 965.38it/s]Running loglikelihood requests:  85%|████████▌ | 47961/56168 [01:31<00:08, 967.24it/s]Running loglikelihood requests:  86%|████████▌ | 48089/56168 [01:31<00:08, 967.74it/s]Running loglikelihood requests:  86%|████████▌ | 48217/56168 [01:31<00:08, 965.68it/s]Running loglikelihood requests:  86%|████████▌ | 48345/56168 [01:32<00:07, 1009.63it/s]Running loglikelihood requests:  86%|████████▋ | 48477/56168 [01:32<00:07, 1051.68it/s]Running loglikelihood requests:  87%|████████▋ | 48605/56168 [01:32<00:07, 1068.19it/s]Running loglikelihood requests:  87%|████████▋ | 48733/56168 [01:32<00:06, 1086.65it/s]Running loglikelihood requests:  87%|████████▋ | 48861/56168 [01:32<00:06, 1098.53it/s]Running loglikelihood requests:  87%|████████▋ | 48989/56168 [01:32<00:06, 1104.96it/s]Running loglikelihood requests:  87%|████████▋ | 49125/56168 [01:32<00:06, 1133.19it/s]Running loglikelihood requests:  88%|████████▊ | 49253/56168 [01:32<00:06, 1133.90it/s]Running loglikelihood requests:  88%|████████▊ | 49381/56168 [01:32<00:05, 1131.72it/s]Running loglikelihood requests:  88%|████████▊ | 49509/56168 [01:33<00:05, 1132.97it/s]Running loglikelihood requests:  88%|████████▊ | 49637/56168 [01:33<00:05, 1131.26it/s]Running loglikelihood requests:  89%|████████▊ | 49765/56168 [01:33<00:05, 1130.76it/s]Running loglikelihood requests:  89%|████████▉ | 49893/56168 [01:33<00:05, 1136.39it/s]Running loglikelihood requests:  89%|████████▉ | 50021/56168 [01:33<00:05, 1135.41it/s]Running loglikelihood requests:  89%|████████▉ | 50149/56168 [01:33<00:05, 1135.74it/s]Running loglikelihood requests:  90%|████████▉ | 50277/56168 [01:33<00:05, 1138.30it/s]Running loglikelihood requests:  90%|████████▉ | 50405/56168 [01:33<00:05, 1136.81it/s]Running loglikelihood requests:  90%|████████▉ | 50533/56168 [01:33<00:04, 1139.02it/s]Running loglikelihood requests:  90%|█████████ | 50661/56168 [01:34<00:04, 1139.66it/s]Running loglikelihood requests:  90%|█████████ | 50793/56168 [01:34<00:04, 1149.67it/s]Running loglikelihood requests:  91%|█████████ | 50921/56168 [01:34<00:04, 1151.39it/s]Running loglikelihood requests:  91%|█████████ | 51049/56168 [01:34<00:04, 1145.37it/s]Running loglikelihood requests:  91%|█████████ | 51177/56168 [01:34<00:04, 1144.66it/s]Running loglikelihood requests:  91%|█████████▏| 51305/56168 [01:34<00:04, 1146.43it/s]Running loglikelihood requests:  92%|█████████▏| 51433/56168 [01:34<00:04, 1147.68it/s]Running loglikelihood requests:  92%|█████████▏| 51561/56168 [01:34<00:04, 1150.98it/s]Running loglikelihood requests:  92%|█████████▏| 51689/56168 [01:34<00:03, 1148.89it/s]Running loglikelihood requests:  92%|█████████▏| 51821/56168 [01:35<00:03, 1162.38it/s]Running loglikelihood requests:  92%|█████████▏| 51949/56168 [01:35<00:03, 1160.37it/s]Running loglikelihood requests:  93%|█████████▎| 52077/56168 [01:35<00:03, 1162.26it/s]Running loglikelihood requests:  93%|█████████▎| 52205/56168 [01:35<00:03, 1159.22it/s]Running loglikelihood requests:  93%|█████████▎| 52333/56168 [01:35<00:03, 1158.32it/s]Running loglikelihood requests:  93%|█████████▎| 52461/56168 [01:35<00:03, 1160.24it/s]Running loglikelihood requests:  94%|█████████▎| 52589/56168 [01:35<00:03, 1162.88it/s]Running loglikelihood requests:  94%|█████████▍| 52717/56168 [01:35<00:02, 1164.07it/s]Running loglikelihood requests:  94%|█████████▍| 52845/56168 [01:35<00:02, 1166.39it/s]Running loglikelihood requests:  94%|█████████▍| 52973/56168 [01:36<00:02, 1164.62it/s]Running loglikelihood requests:  95%|█████████▍| 53101/56168 [01:36<00:02, 1168.26it/s]Running loglikelihood requests:  95%|█████████▍| 53229/56168 [01:36<00:02, 1168.52it/s]Running loglikelihood requests:  95%|█████████▍| 53357/56168 [01:36<00:02, 1172.65it/s]Running loglikelihood requests:  95%|█████████▌| 53485/56168 [01:36<00:02, 1170.18it/s]Running loglikelihood requests:  95%|█████████▌| 53613/56168 [01:36<00:02, 1169.35it/s]Running loglikelihood requests:  96%|█████████▌| 53741/56168 [01:36<00:02, 1164.74it/s]Running loglikelihood requests:  96%|█████████▌| 53869/56168 [01:36<00:01, 1165.90it/s]Running loglikelihood requests:  96%|█████████▌| 53997/56168 [01:36<00:01, 1163.81it/s]Running loglikelihood requests:  96%|█████████▋| 54125/56168 [01:37<00:01, 1167.55it/s]Running loglikelihood requests:  97%|█████████▋| 54253/56168 [01:37<00:01, 1166.58it/s]Running loglikelihood requests:  97%|█████████▋| 54381/56168 [01:37<00:01, 1167.61it/s]Running loglikelihood requests:  97%|█████████▋| 54509/56168 [01:37<00:01, 1169.42it/s]Running loglikelihood requests:  97%|█████████▋| 54637/56168 [01:37<00:01, 1171.81it/s]Running loglikelihood requests:  98%|█████████▊| 54765/56168 [01:37<00:01, 1172.63it/s]Running loglikelihood requests:  98%|█████████▊| 54893/56168 [01:37<00:01, 1174.86it/s]Running loglikelihood requests:  98%|█████████▊| 55021/56168 [01:37<00:00, 1176.58it/s]Running loglikelihood requests:  98%|█████████▊| 55149/56168 [01:37<00:00, 1178.41it/s]Running loglikelihood requests:  98%|█████████▊| 55277/56168 [01:38<00:00, 1188.30it/s]Running loglikelihood requests:  99%|█████████▊| 55405/56168 [01:38<00:00, 1193.29it/s]Running loglikelihood requests:  99%|█████████▉| 55533/56168 [01:38<00:00, 1198.21it/s]Running loglikelihood requests:  99%|█████████▉| 55661/56168 [01:38<00:00, 1205.15it/s]Running loglikelihood requests:  99%|█████████▉| 55789/56168 [01:38<00:00, 1212.22it/s]Running loglikelihood requests: 100%|█████████▉| 55917/56168 [01:38<00:00, 1218.90it/s]Running loglikelihood requests: 100%|█████████▉| 56109/56168 [01:38<00:00, 1296.36it/s]Running loglikelihood requests: 100%|██████████| 56168/56168 [01:38<00:00, 569.05it/s] 
+2026-04-03:16:40:08,272 WARNING  [huggingface.py:1344] Failed to get model SHA for /egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6 at revision main. Error: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6'. Use `repo_type` argument if needed.
+2026-04-03:16:40:13,621 INFO     [evaluation_tracker.py:206] Saving results aggregated
+2026-04-03:16:40:13,641 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_abstract_algebra
+2026-04-03:16:40:13,711 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_anatomy
+2026-04-03:16:40:13,809 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_astronomy
+2026-04-03:16:40:13,927 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_business_ethics
+2026-04-03:16:40:14,011 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_clinical_knowledge
+2026-04-03:16:40:14,215 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_biology
+2026-04-03:16:40:14,331 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_chemistry
+2026-04-03:16:40:14,423 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_computer_science
+2026-04-03:16:40:14,517 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_mathematics
+2026-04-03:16:40:14,607 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_medicine
+2026-04-03:16:40:14,749 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_college_physics
+2026-04-03:16:40:14,828 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_computer_security
+2026-04-03:16:40:14,915 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_conceptual_physics
+2026-04-03:16:40:15,093 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_econometrics
+2026-04-03:16:40:15,183 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_electrical_engineering
+2026-04-03:16:40:15,308 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_elementary_mathematics
+2026-04-03:16:40:15,632 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_formal_logic
+2026-04-03:16:40:15,732 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_global_facts
+2026-04-03:16:40:15,808 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_biology
+2026-04-03:16:40:16,059 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_chemistry
+2026-04-03:16:40:16,266 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_computer_science
+2026-04-03:16:40:16,375 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_european_history
+2026-04-03:16:40:16,583 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_geography
+2026-04-03:16:40:16,806 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_government_and_politics
+2026-04-03:16:40:17,022 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_macroeconomics
+2026-04-03:16:40:17,385 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_mathematics
+2026-04-03:16:40:17,684 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_microeconomics
+2026-04-03:16:40:17,929 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_physics
+2026-04-03:16:40:18,109 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_psychology
+2026-04-03:16:40:18,727 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_statistics
+2026-04-03:16:40:18,908 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_us_history
+2026-04-03:16:40:19,537 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_high_school_world_history
+2026-04-03:16:40:19,726 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_human_aging
+2026-04-03:16:40:19,895 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_human_sexuality
+2026-04-03:16:40:20,007 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_international_law
+2026-04-03:16:40:20,097 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_jurisprudence
+2026-04-03:16:40:20,195 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_logical_fallacies
+2026-04-03:16:40:20,341 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_machine_learning
+2026-04-03:16:40:20,436 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_management
+2026-04-03:16:40:20,524 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_marketing
+2026-04-03:16:40:20,789 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_medical_genetics
+2026-04-03:16:40:20,897 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_miscellaneous
+2026-04-03:16:40:21,555 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_moral_disputes
+2026-04-03:16:40:21,850 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_moral_scenarios
+2026-04-03:16:40:22,676 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_nutrition
+2026-04-03:16:40:22,944 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_philosophy
+2026-04-03:16:40:23,198 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_prehistory
+2026-04-03:16:40:23,473 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_professional_accounting
+2026-04-03:16:40:23,719 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_professional_law
+2026-04-03:16:40:25,243 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_professional_medicine
+2026-04-03:16:40:25,522 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_professional_psychology
+2026-04-03:16:40:26,010 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_public_relations
+2026-04-03:16:40:26,094 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_security_studies
+2026-04-03:16:40:26,264 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_sociology
+2026-04-03:16:40:26,404 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_us_foreign_policy
+2026-04-03:16:40:26,475 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_virology
+2026-04-03:16:40:26,587 INFO     [evaluation_tracker.py:287] Saving per-sample results for: mmlu_world_religions
+hf (pretrained=/egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 16
+|                 Tasks                 |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
+|---------------------------------------|------:|------|-----:|------|---|-----:|---|-----:|
+|mmlu                                   |      2|none  |      |acc   |↑  |0.5713|±  |0.0040|
+| - humanities                          |      2|none  |      |acc   |↑  |0.5186|±  |0.0069|
+|  - formal_logic                       |      1|none  |     0|acc   |↑  |0.3810|±  |0.0434|
+|  - high_school_european_history       |      1|none  |     0|acc   |↑  |0.6970|±  |0.0359|
+|  - high_school_us_history             |      1|none  |     0|acc   |↑  |0.7500|±  |0.0304|
+|  - high_school_world_history          |      1|none  |     0|acc   |↑  |0.7384|±  |0.0286|
+|  - international_law                  |      1|none  |     0|acc   |↑  |0.7107|±  |0.0414|
+|  - jurisprudence                      |      1|none  |     0|acc   |↑  |0.6944|±  |0.0445|
+|  - logical_fallacies                  |      1|none  |     0|acc   |↑  |0.6810|±  |0.0366|
+|  - moral_disputes                     |      1|none  |     0|acc   |↑  |0.6590|±  |0.0255|
+|  - moral_scenarios                    |      1|none  |     0|acc   |↑  |0.2804|±  |0.0150|
+|  - philosophy                         |      1|none  |     0|acc   |↑  |0.6463|±  |0.0272|
+|  - prehistory                         |      1|none  |     0|acc   |↑  |0.6451|±  |0.0266|
+|  - professional_law                   |      1|none  |     0|acc   |↑  |0.4231|±  |0.0126|
+|  - world_religions                    |      1|none  |     0|acc   |↑  |0.8129|±  |0.0299|
+| - other                               |      2|none  |      |acc   |↑  |0.6350|±  |0.0082|
+|  - business_ethics                    |      1|none  |     0|acc   |↑  |0.5200|±  |0.0502|
+|  - clinical_knowledge                 |      1|none  |     0|acc   |↑  |0.6491|±  |0.0294|
+|  - college_medicine                   |      1|none  |     0|acc   |↑  |0.5896|±  |0.0375|
+|  - global_facts                       |      1|none  |     0|acc   |↑  |0.3000|±  |0.0461|
+|  - human_aging                        |      1|none  |     0|acc   |↑  |0.6368|±  |0.0323|
+|  - management                         |      1|none  |     0|acc   |↑  |0.7476|±  |0.0430|
+|  - marketing                          |      1|none  |     0|acc   |↑  |0.8248|±  |0.0249|
+|  - medical_genetics                   |      1|none  |     0|acc   |↑  |0.6400|±  |0.0482|
+|  - miscellaneous                      |      1|none  |     0|acc   |↑  |0.7778|±  |0.0149|
+|  - nutrition                          |      1|none  |     0|acc   |↑  |0.6601|±  |0.0271|
+|  - professional_accounting            |      1|none  |     0|acc   |↑  |0.4149|±  |0.0294|
+|  - professional_medicine              |      1|none  |     0|acc   |↑  |0.5699|±  |0.0301|
+|  - virology                           |      1|none  |     0|acc   |↑  |0.3494|±  |0.0371|
+| - social sciences                     |      2|none  |      |acc   |↑  |0.6783|±  |0.0082|
+|  - econometrics                       |      1|none  |     0|acc   |↑  |0.4298|±  |0.0466|
+|  - high_school_geography              |      1|none  |     0|acc   |↑  |0.7172|±  |0.0321|
+|  - high_school_government_and_politics|      1|none  |     0|acc   |↑  |0.8290|±  |0.0272|
+|  - high_school_macroeconomics         |      1|none  |     0|acc   |↑  |0.5795|±  |0.0250|
+|  - high_school_microeconomics         |      1|none  |     0|acc   |↑  |0.6050|±  |0.0318|
+|  - high_school_psychology             |      1|none  |     0|acc   |↑  |0.7890|±  |0.0175|
+|  - human_sexuality                    |      1|none  |     0|acc   |↑  |0.6565|±  |0.0416|
+|  - professional_psychology            |      1|none  |     0|acc   |↑  |0.5997|±  |0.0198|
+|  - public_relations                   |      1|none  |     0|acc   |↑  |0.6545|±  |0.0455|
+|  - security_studies                   |      1|none  |     0|acc   |↑  |0.6490|±  |0.0306|
+|  - sociology                          |      1|none  |     0|acc   |↑  |0.8557|±  |0.0248|
+|  - us_foreign_policy                  |      1|none  |     0|acc   |↑  |0.8000|±  |0.0402|
+| - stem                                |      2|none  |      |acc   |↑  |0.4827|±  |0.0087|
+|  - abstract_algebra                   |      1|none  |     0|acc   |↑  |0.3000|±  |0.0461|
+|  - anatomy                            |      1|none  |     0|acc   |↑  |0.5630|±  |0.0428|
+|  - astronomy                          |      1|none  |     0|acc   |↑  |0.5855|±  |0.0401|
+|  - college_biology                    |      1|none  |     0|acc   |↑  |0.6389|±  |0.0402|
+|  - college_chemistry                  |      1|none  |     0|acc   |↑  |0.4900|±  |0.0502|
+|  - college_computer_science           |      1|none  |     0|acc   |↑  |0.5100|±  |0.0502|
+|  - college_mathematics                |      1|none  |     0|acc   |↑  |0.3700|±  |0.0485|
+|  - college_physics                    |      1|none  |     0|acc   |↑  |0.5294|±  |0.0497|
+|  - computer_security                  |      1|none  |     0|acc   |↑  |0.4800|±  |0.0502|
+|  - conceptual_physics                 |      1|none  |     0|acc   |↑  |0.4936|±  |0.0327|
+|  - electrical_engineering             |      1|none  |     0|acc   |↑  |0.5310|±  |0.0416|
+|  - elementary_mathematics             |      1|none  |     0|acc   |↑  |0.3995|±  |0.0252|
+|  - high_school_biology                |      1|none  |     0|acc   |↑  |0.6581|±  |0.0270|
+|  - high_school_chemistry              |      1|none  |     0|acc   |↑  |0.4828|±  |0.0352|
+|  - high_school_computer_science       |      1|none  |     0|acc   |↑  |0.5700|±  |0.0498|
+|  - high_school_mathematics            |      1|none  |     0|acc   |↑  |0.3370|±  |0.0288|
+|  - high_school_physics                |      1|none  |     0|acc   |↑  |0.2914|±  |0.0371|
+|  - high_school_statistics             |      1|none  |     0|acc   |↑  |0.4907|±  |0.0341|
+|  - machine_learning                   |      1|none  |     0|acc   |↑  |0.4643|±  |0.0473|
+
+|      Groups      |Version|Filter|n-shot|Metric|   |Value |   |Stderr|
+|------------------|------:|------|------|------|---|-----:|---|-----:|
+|mmlu              |      2|none  |      |acc   |↑  |0.5713|±  |0.0040|
+| - humanities     |      2|none  |      |acc   |↑  |0.5186|±  |0.0069|
+| - other          |      2|none  |      |acc   |↑  |0.6350|±  |0.0082|
+| - social sciences|      2|none  |      |acc   |↑  |0.6783|±  |0.0082|
+| - stem           |      2|none  |      |acc   |↑  |0.4827|±  |0.0087|
+
+[INFO] (mmlu) success (found result table and non-empty json).
+[DONE] logs saved under: /egr/research-optml/wangc168/Muon_wmdp/rmu_wmdp/models/muon_v1_test_V1_K64_batches150_bs4_alpha1200-1200_steer15-15_muonlr2.6e-3_muonmom0.95_muonwd0.1_adamlr5e-5_seed42_layer7_layers5-6-7_params6/lm_eval_logs