gatmiry committed 27 days ago

Commit

c7f1373

verified ·

1 Parent(s): beda614

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +329 -0
__pycache__/model_tbyt_train.cpython-312.pyc +0 -0
attn_by_number_worker.py +194 -0
gpu_worker.py +236 -73
hijack_layer1_worker.py +229 -0
model_tbyt_train.py +123 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_accuracy.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_conditional_accuracy.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_per_position.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/attn_heatmaps.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.npz +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.npz +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_accuracy.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_conditional_accuracy.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer0.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer1.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_bynext_heatmap_layer0.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer0.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer1.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_bynext_heatmap_layer0.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer0.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer1.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer0.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer1.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_asym_ub60_lb60.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub10.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub15.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub20.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub30.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub50.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub60.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub60_high.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub10.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub15.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub20.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub30.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub50.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub60.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub60_high.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_random_layer0.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_separator_layer0.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_accuracy.png +0 -0
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_conditional_accuracy.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_per_position.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/attn_heatmaps.png +3 -0
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/avg_attn_by_number_layer0.npz +3 -0

.gitattributes CHANGED Viewed

@@ -278,3 +278,332 @@ outputs/plots_N256_B16_ds1338_is1340_final/perlocation/perlocation_layer0.png fi
 outputs/plots_N256_B16_ds1338_is1340_final/perlocation/perlocation_layer1.png filter=lfs diff=lfs merge=lfs -text
 outputs/plots_N256_B16_ds1338_is1340_final/pernumber/pernumber_layer0.png filter=lfs diff=lfs merge=lfs -text
 outputs/plots_N256_B16_ds1338_is1340_final/pernumber/pernumber_layer1.png filter=lfs diff=lfs merge=lfs -text

 outputs/plots_N256_B16_ds1338_is1340_final/perlocation/perlocation_layer1.png filter=lfs diff=lfs merge=lfs -text
 outputs/plots_N256_B16_ds1338_is1340_final/pernumber/pernumber_layer0.png filter=lfs diff=lfs merge=lfs -text
 outputs/plots_N256_B16_ds1338_is1340_final/pernumber/pernumber_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_sample_count_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_hijack_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/ablation_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/ablation_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
+outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text

__pycache__/model_tbyt_train.cpython-312.pyc ADDED Viewed

Binary file (9.79 kB). View file

attn_by_number_worker.py ADDED Viewed

	@@ -0,0 +1,194 @@

+#!/usr/bin/env python3
+"""
+Worker: compute average attention-by-number-value heatmap (256×256)
+for both attention layers of each assigned checkpoint.
+For each token position with value i, we accumulate its attention weights
+to all visible positions with value j, then normalize by the total count
+of from-positions with value i. The result: avg_matrix[i,j] ≈ fraction
+of attention that number i pays to number j (rows sum to ~1).
+"""
+import argparse, json, os, sys, time, types
+import numpy as np
+import torch
+import torch.nn.functional as F
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'grid-run'))
+from model_analysis import GPT, GPTConfig
+VOCAB_SIZE = 256  # number of distinct number tokens (0..255); the id 256 is used as the separator in get_batch
+BLOCK_SIZE = 16  # how many numbers are drawn for each sequence
+SEQ_LEN = 2 * BLOCK_SIZE + 1  # 33 = unsorted block + separator + sorted block
+N_LAYERS = 2  # attention layers that are patched and accumulated
+BATCH_SIZE = 1024  # sequences per forward pass
+N_BATCHES = 100  # 102 400 sequences total
+def remap_state_dict(sd):
+    """Rename per-block submodule prefixes in a checkpoint state dict.
+
+    Every ``transformer.h.<i>.attn.`` prefix becomes ``transformer.h.<i>.c_attn.``
+    and every ``transformer.h.<i>.mlp.`` prefix becomes ``transformer.h.<i>.c_fc.``.
+    The layer index ``<i>`` is matched generically, so models with ten or more
+    blocks are remapped as well; keys left unrenamed would otherwise be dropped
+    without any error by the non-strict ``load_state_dict`` call in ``load_model``.
+
+    Args:
+        sd: mapping from parameter name to value. It is not modified.
+
+    Returns:
+        A new dict holding the same values under the renamed keys, in the
+        iteration order of ``sd``.
+    """
+    import re  # local import: the module header does not import re
+
+    renames = {'attn': 'c_attn', 'mlp': 'c_fc'}
+    # group 1 = block prefix incl. trailing dot, group 2 = old submodule name
+    pattern = re.compile(r'(transformer\.h\.[0-9]+\.)(attn|mlp)\.')
+
+    def _rename(match):
+        return f'{match.group(1)}{renames[match.group(2)]}.'
+
+    return {pattern.sub(_rename, key): val for key, val in sd.items()}
+def load_model(ckpt_path, device):
+    """Build a GPT from a training checkpoint and put it in eval mode on ``device``.
+
+    Args:
+        ckpt_path: path to a checkpoint that contains ``'model_config'`` and
+            ``'model_state_dict'`` entries.
+        device: target device for the returned model.
+
+    Returns:
+        ``(model, config)`` - the loaded model and the ``GPTConfig`` it was built with.
+    """
+    # NOTE(review): weights_only=False unpickles arbitrary objects; only use
+    # this on checkpoints from a trusted source.
+    checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=False)
+    model_cfg = checkpoint['model_config']
+    # The analysis model is configured with one vocabulary entry fewer than the
+    # checkpoint reports (presumably the separator token - confirm against GPTConfig).
+    n_vocab = model_cfg['vocab_size'] - 1
+    n_positions = model_cfg['block_size']
+    config = GPTConfig(
+        block_size=n_positions,
+        vocab_size=n_vocab,
+        with_layer_norm=model_cfg.get('use_final_LN', True),
+    )
+    model = GPT(config)
+
+    state = remap_state_dict(checkpoint['model_state_dict'])
+
+    # Trim the positional embedding table to at most 4 * block_size + 1 rows.
+    wpe_key = 'transformer.wpe.weight'
+    max_rows = n_positions * 4 + 1
+    if wpe_key in state and state[wpe_key].shape[0] > max_rows:
+        state[wpe_key] = state[wpe_key][:max_rows]
+
+    # Drop '<...>.c_attn.bias' entries that do not belong to the inner
+    # 'c_attn.c_attn' projection (presumably stored mask buffers - TODO confirm).
+    stale = [name for name in state
+             if name.endswith('.c_attn.bias') and 'c_attn.c_attn' not in name]
+    for name in stale:
+        del state[name]
+    state.pop('lm_head.weight', None)
+
+    # strict=False: missing or unexpected keys are ignored without an error.
+    model.load_state_dict(state, strict=False)
+    model.to(device).eval()
+    return model, config
+def patch_attention(model):
+    """Swap the attention forward of the first ``N_LAYERS`` blocks for a recording one.
+
+    After patching, every forward call stores the post-softmax attention
+    weights on the attention module as ``batched_attn`` with shape
+    (B, n_heads, T, T) and then returns the projected output as before.
+    """
+    def recording_forward(self_attn, x, layer_n=-1):
+        # layer_n is accepted for call compatibility and is not used here.
+        B, T, C = x.size()
+        n_heads = self_attn.n_heads
+        head_dim = C // n_heads
+        # One fused projection, split into q / k / v and reshaped to (B, heads, T, head_dim).
+        q, k, v = (
+            part.view(B, T, n_heads, head_dim).transpose(1, 2)
+            for part in self_attn.c_attn(x).split(self_attn.n_embd, dim=2)
+        )
+        # Scaled dot product with an extra 0.1 factor on the logits.
+        scores = q @ k.transpose(-1, -2) * 0.1 / k.size(-1) ** 0.5
+        # Positions where the module's bias buffer is zero are masked out.
+        hidden = self_attn.bias[:, :, :T, :T] == 0
+        weights = F.softmax(scores.masked_fill(hidden, float('-inf')), dim=-1)
+        self_attn.batched_attn = weights
+        mixed = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
+        return self_attn.c_proj(mixed)
+
+    for idx in range(N_LAYERS):
+        target = model.transformer.h[idx].c_attn
+        target.forward = types.MethodType(recording_forward, target)
+def get_batch(device):
+    """Sample one batch of sequences: shuffled numbers, separator, the same numbers sorted.
+
+    Each row holds ``BLOCK_SIZE`` distinct values from ``0..VOCAB_SIZE-1`` (the
+    leading columns of a random permutation), then the separator id
+    ``VOCAB_SIZE``, then those values in ascending order.
+
+    Returns:
+        Tensor of shape (BATCH_SIZE, 2 * BLOCK_SIZE + 1) on ``device``.
+    """
+    noise = torch.rand(BATCH_SIZE, VOCAB_SIZE, device=device)
+    # argsort of i.i.d. noise gives a random permutation per row; keep its head.
+    unsorted = noise.argsort(dim=1)[:, :BLOCK_SIZE]
+    ordered = torch.sort(unsorted, dim=1).values
+    separator = torch.full((BATCH_SIZE, 1), VOCAB_SIZE, dtype=torch.long, device=device)
+    return torch.cat([unsorted, separator, ordered], dim=1)
+@torch.no_grad()
+def compute(model, device):
+    """Accumulate number-to-number attention over ``N_BATCHES`` random batches.
+
+    For every causally visible (from, to) position pair whose tokens are both
+    numbers (not the separator), the attention weight of head 0 is added to the
+    cell (from value, to value). Each row is finally divided by how many
+    number positions carried that from value.
+
+    Args:
+        model: a model already processed by ``patch_attention``.
+        device: device on which batches and accumulators are created.
+
+    Returns:
+        ``(results, from_count)`` where ``results`` is a list with one
+        (VOCAB_SIZE, VOCAB_SIZE) numpy array per layer and ``from_count`` is a
+        numpy array with the per-value position counts.
+    """
+    n_vals = VOCAB_SIZE
+    seq = SEQ_LEN
+    lower_tri = torch.tril(torch.ones(seq, seq, device=device, dtype=torch.bool))
+    # Flat (value_from * n_vals + value_to) accumulators, one per layer.
+    totals = [torch.zeros(n_vals * n_vals, device=device, dtype=torch.float64)
+              for _ in range(N_LAYERS)]
+    from_cnt = torch.zeros(n_vals, device=device, dtype=torch.float64)
+
+    for _ in range(N_BATCHES):
+        tokens = get_batch(device)
+        model(tokens)  # run only for its side effect: fills batched_attn on each layer
+
+        is_number = tokens < n_vals
+        # Broadcast (B, T, 1) against (B, 1, T) to get every from/to pair.
+        keep = lower_tri.unsqueeze(0) & is_number.unsqueeze(2) & is_number.unsqueeze(1)
+        pair_idx = (tokens.unsqueeze(2) * n_vals + tokens.unsqueeze(1)).long()[keep]
+
+        for layer, total in enumerate(totals):
+            weights = model.transformer.h[layer].c_attn.batched_attn[:, 0]
+            total.scatter_add_(0, pair_idx, weights[keep].double())
+
+        numbers = tokens[is_number]
+        from_cnt.scatter_add_(
+            0, numbers.long(),
+            torch.ones(numbers.numel(), device=device, dtype=torch.float64))
+
+    # clamp avoids division by zero for values that never appeared.
+    denom = from_cnt.clamp(min=1).unsqueeze(1)
+    results = [(total.view(n_vals, n_vals) / denom).cpu().numpy() for total in totals]
+    return results, from_cnt.cpu().numpy()
+def plot_heatmap(avg, layer, out_dir, ckpt_label):
+    """Render one layer's averaged attention matrix as a PNG heatmap.
+
+    Args:
+        avg: 2-D array; rows are the attending number, columns the attended-to number.
+        layer: zero-based layer index (shown one-based in the title, zero-based
+            in the file name).
+        out_dir: existing directory that receives the image.
+        ckpt_label: text placed on the second title line.
+
+    Returns:
+        Path of the written PNG file.
+    """
+    fig, ax = plt.subplots(figsize=(10, 9))
+
+    # Cap the colour scale at the 99th percentile of the positive cells so a
+    # few very large entries do not wash out the rest.
+    positive = avg[avg > 0]
+    if positive.size > 0:
+        color_cap = np.percentile(positive, 99)
+    else:
+        color_cap = 1.0
+
+    image = ax.imshow(
+        avg,
+        aspect='auto',
+        origin='lower',
+        cmap='inferno',
+        vmin=0,
+        vmax=color_cap,
+        interpolation='nearest',
+    )
+    ax.set_xlabel('To number (attended-to)', fontsize=12)
+    ax.set_ylabel('From number (attending)', fontsize=12)
+    ax.set_title(f'Layer {layer+1}: avg attention  (number → number)\n{ckpt_label}',
+                 fontsize=11)
+
+    tick_positions = [*range(0, 256, 32), 255]
+    for set_ticks in (ax.set_xticks, ax.set_yticks):
+        set_ticks(tick_positions)
+
+    fig.colorbar(image, ax=ax, shrink=0.82, label='Avg attention weight')
+    plt.tight_layout()
+
+    target = os.path.join(out_dir, f'avg_attn_by_number_layer{layer}.png')
+    fig.savefig(target, dpi=150, bbox_inches='tight')
+    plt.close(fig)
+    return target
+def main():
+    """Process every checkpoint listed in the tasks file on one GPU.
+
+    Command line: ``--tasks-file`` (JSON list of objects with ``ckpt_path`` and
+    ``out_dir``) and ``--gpu`` (index exported as ``CUDA_VISIBLE_DEVICES``).
+    A checkpoint is skipped when both layer PNGs already exist in its output
+    directory; otherwise one ``.npz`` and one ``.png`` per layer are written.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--tasks-file', required=True)
+    parser.add_argument('--gpu', type=int, required=True)
+    args = parser.parse_args()
+
+    # Restrict this process to the requested GPU; it is then addressed as 'cuda'.
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+    device = 'cuda'
+
+    with open(args.tasks_file) as handle:
+        tasks = json.load(handle)
+    print(f"GPU {args.gpu}: {len(tasks)} checkpoints", flush=True)
+
+    for task in tasks:
+        ckpt_path, out_dir = task['ckpt_path'], task['out_dir']
+        label = os.path.basename(ckpt_path).replace('.pt', '')
+
+        expected_pngs = ('avg_attn_by_number_layer0.png', 'avg_attn_by_number_layer1.png')
+        if all(os.path.exists(os.path.join(out_dir, png)) for png in expected_pngs):
+            print(f"  Skip (exists): {label}", flush=True)
+            continue
+
+        started = time.time()
+        model, _ = load_model(ckpt_path, device)
+        patch_attention(model)
+        avgs, from_cnt = compute(model, device)
+
+        os.makedirs(out_dir, exist_ok=True)
+        for layer, avg in enumerate(avgs):
+            npz_path = os.path.join(out_dir, f'avg_attn_by_number_layer{layer}.npz')
+            np.savez(npz_path, avg_attn=avg, from_count=from_cnt)
+            plot_heatmap(avg, layer, out_dir, label)
+
+        elapsed = time.time() - started
+        print(f"  Done: {label} ({elapsed:.1f}s)", flush=True)
+
+        # Release the model before the next checkpoint is loaded.
+        del model
+        torch.cuda.empty_cache()
+
+    print(f"GPU {args.gpu}: all done.", flush=True)
+if __name__ == '__main__':
+    main()

gpu_worker.py CHANGED Viewed

@@ -1,23 +1,25 @@
 """
-GPU worker: processes a batch of analysis tasks on a single GPU.
-Model is loaded once per checkpoint and reused for all tasks on that checkpoint.
-Prints JSON status lines so the launcher can track progress.
 """
 import argparse
 import json
 import os
 import sys
 import time
 import numpy as np
 import torch
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'grid-run'))
 from model_analysis import GPT, GPTConfig, GPTIntervention
-def remap_state_dict(sd_100k):
     new_sd = {}
-    for key, val in sd_100k.items():
         new_key = key
         for i in range(10):
             new_key = new_key.replace(f'transformer.h.{i}.attn.', f'transformer.h.{i}.c_attn.')
@@ -59,6 +61,75 @@ def get_batch(vocab_size, block_size, device='cpu'):
     return torch.cat((x, torch.tensor([vocab_size]), vals), dim=0).unsqueeze(0).to(device)
 def compute_cinclogits(model, config, device, attn_layer, num_tries=100):
     bs = config.block_size
     vs = config.vocab_size
@@ -87,7 +158,10 @@ def compute_cinclogits(model, config, device, attn_layer, num_tries=100):
     return acc_cl / num_tries, acc_icl / num_tries
-def compute_intensity(model, config, device, attn_layer, ub=5, min_valid=200):
     bs = config.block_size
     vs = config.vocab_size
     location = bs + 5
@@ -102,8 +176,8 @@ def compute_intensity(model, config, device, attn_layer, ub=5, min_valid=200):
                 im = GPTIntervention(model, idx)
                 im.intervent_attention(
                     attention_layer_num=attn_layer, location=location,
-                    unsorted_lb=ub, unsorted_ub=ub,
-                    unsorted_lb_num=0, unsorted_ub_num=1,
                     unsorted_intensity_inc=intens,
                     sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
                 g, n = im.check_if_still_works()
@@ -116,106 +190,197 @@ def compute_intensity(model, config, device, attn_layer, ub=5, min_valid=200):
     return np.array(intensities), np.array(rates), np.array(counts)
-def compute_ablation(model, config, device, skip_layer, num_trials=500):
     bs = config.block_size
     vs = config.vocab_size
-    block = model.transformer.h[skip_layer]
-    orig_fwd = block.forward
-    def skip_attn(x, layer_n=-1):
-        return x + block.c_fc(block.ln_2(x))
-    block.forward = skip_attn
-    pp = np.zeros(bs)
-    fs = 0
-    cc = np.zeros(bs)
-    ce = np.zeros(bs)
-    try:
-        for _ in range(num_trials):
-            idx = get_batch(vs, bs, device)
             with torch.no_grad():
                 logits, _ = model(idx)
-            preds = torch.argmax(logits[0, bs:2*bs, :], dim=1)
-            targets = idx[0, bs+1:]
-            correct = (preds == targets).cpu().numpy()
-            pp += correct
-            if correct.all():
-                fs += 1
-            ok = True
-            for i in range(bs):
-                if ok:
-                    ce[i] += 1
-                    if correct[i]:
-                        cc[i] += 1
-                    else:
-                        ok = False
-                else:
-                    break
-    finally:
-        block.forward = orig_fwd
-    return pp / num_trials, fs / num_trials, np.where(ce > 0, cc / ce, 0.0), ce
-def compute_baseline(model, config, device, num_trials=500):
     bs = config.block_size
     vs = config.vocab_size
-    pp = np.zeros(bs)
-    fs = 0
-    cc = np.zeros(bs)
-    ce = np.zeros(bs)
-    for _ in range(num_trials):
         idx = get_batch(vs, bs, device)
         with torch.no_grad():
             logits, _ = model(idx)
-        preds = torch.argmax(logits[0, bs:2*bs, :], dim=1)
-        targets = idx[0, bs+1:]
-        correct = (preds == targets).cpu().numpy()
-        pp += correct
-        if correct.all():
-            fs += 1
-        ok = True
-        for i in range(bs):
-            if ok:
-                ce[i] += 1
-                if correct[i]:
-                    cc[i] += 1
-                else:
-                    ok = False
-            else:
-                break
-    return pp / num_trials, fs / num_trials, np.where(ce > 0, cc / ce, 0.0), ce
-def process_task(task, model, config, device, out_dir, itr):
     task_type = task['type']
     out_path = task['out']
     if os.path.exists(out_path):
         return True
     os.makedirs(os.path.dirname(out_path), exist_ok=True)
     if task_type == 'baseline':
         pp, fs, ca, ce = compute_baseline(model, config, device)
         np.savez(out_path, per_pos_acc=pp, full_seq_acc=fs,
                  cond_acc=ca, cond_eligible=ce, itr=itr)
     elif task_type == 'ablation':
         pp, fs, ca, ce = compute_ablation(model, config, device, task['layer'])
         np.savez(out_path, per_pos_acc=pp, full_seq_acc=fs,
                  cond_acc=ca, cond_eligible=ce, skip_layer=task['layer'], itr=itr)
     elif task_type == 'cinclogits':
         cl, icl = compute_cinclogits(model, config, device, task['layer'])
         np.savez(out_path, clogit_icscore=cl, iclogit_icscore=icl, itr=itr)
     elif task_type == 'intensity':
         intensities, rates, counts = compute_intensity(
             model, config, device, task['layer'], ub=task['ub'])
         np.savez(out_path, intensities=intensities, success_rates=rates,
                  counts=counts, itr=itr)
     return True
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--tasks-file', required=True, help='JSON file with task list')
     parser.add_argument('--gpu', type=int, required=True)
     args = parser.parse_args()
@@ -225,8 +390,7 @@ def main():
     with open(args.tasks_file) as f:
         task_list = json.load(f)
-    print(f"GPU {args.gpu}: {len(task_list)} tasks across "
-          f"{len(set(t['ckpt_path'] for t in task_list))} checkpoints", flush=True)
     current_model = None
     current_ckpt = None
@@ -234,21 +398,18 @@ def main():
     for task in task_list:
         ckpt_path = task['ckpt_path']
         if ckpt_path != current_ckpt:
             t0 = time.time()
             model, config = load_model(ckpt_path, device)
             current_model = model
             current_ckpt = ckpt_path
-            itr = task.get('itr', 100000)
             print(f"  Loaded {os.path.basename(ckpt_path)} ({time.time()-t0:.1f}s)", flush=True)
         t0 = time.time()
         try:
-            process_task(task, current_model, config, device, None, itr)
             dt = time.time() - t0
             done += 1
-            # Print status as JSON for launcher to parse
             print(json.dumps({
                 'status': 'done', 'task': task['name'],
                 'gpu': args.gpu, 'elapsed': round(dt, 1),
@@ -261,6 +422,8 @@ def main():
                 'gpu': args.gpu, 'error': str(e)
             }), flush=True)
 if __name__ == '__main__':
     main()

 """
+GPU worker for 1000k-checkpoint analysis.
+Processes all task types on a single GPU: baseline, ablation, cinclogits,
+intensity (various ub), asymmetric intensity, hijack, separator/random.
 """
 import argparse
 import json
 import os
 import sys
 import time
+import types
 import numpy as np
 import torch
+import torch.nn.functional as F
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'grid-run'))
 from model_analysis import GPT, GPTConfig, GPTIntervention
+def remap_state_dict(sd):
     new_sd = {}
+    for key, val in sd.items():
         new_key = key
         for i in range(10):
             new_key = new_key.replace(f'transformer.h.{i}.attn.', f'transformer.h.{i}.c_attn.')
     return torch.cat((x, torch.tensor([vocab_size]), vals), dim=0).unsqueeze(0).to(device)
+def compute_baseline(model, config, device, num_trials=500):
+    bs = config.block_size
+    vs = config.vocab_size
+    pp = np.zeros(bs)
+    fs = 0
+    cc = np.zeros(bs)
+    ce = np.zeros(bs)
+    for _ in range(num_trials):
+        idx = get_batch(vs, bs, device)
+        with torch.no_grad():
+            logits, _ = model(idx)
+        preds = torch.argmax(logits[0, bs:2*bs, :], dim=1)
+        targets = idx[0, bs+1:]
+        correct = (preds == targets).cpu().numpy()
+        pp += correct
+        if correct.all():
+            fs += 1
+        ok = True
+        for i in range(bs):
+            if ok:
+                ce[i] += 1
+                if correct[i]:
+                    cc[i] += 1
+                else:
+                    ok = False
+            else:
+                break
+    return pp / num_trials, fs / num_trials, np.where(ce > 0, cc / ce, 0.0), ce
+def compute_ablation(model, config, device, skip_layer, num_trials=500):
+    bs = config.block_size
+    block = model.transformer.h[skip_layer]
+    orig_fwd = block.forward
+    def skip_attn(x, layer_n=-1):
+        return x + block.c_fc(block.ln_2(x))
+    block.forward = skip_attn
+    pp = np.zeros(bs)
+    fs = 0
+    cc = np.zeros(bs)
+    ce = np.zeros(bs)
+    try:
+        for _ in range(num_trials):
+            idx = get_batch(config.vocab_size, bs, device)
+            with torch.no_grad():
+                logits, _ = model(idx)
+            preds = torch.argmax(logits[0, bs:2*bs, :], dim=1)
+            targets = idx[0, bs+1:]
+            correct = (preds == targets).cpu().numpy()
+            pp += correct
+            if correct.all():
+                fs += 1
+            ok = True
+            for i in range(bs):
+                if ok:
+                    ce[i] += 1
+                    if correct[i]:
+                        cc[i] += 1
+                    else:
+                        ok = False
+                else:
+                    break
+    finally:
+        block.forward = orig_fwd
+    return pp / num_trials, fs / num_trials, np.where(ce > 0, cc / ce, 0.0), ce
 def compute_cinclogits(model, config, device, attn_layer, num_tries=100):
     bs = config.block_size
     vs = config.vocab_size
     return acc_cl / num_tries, acc_icl / num_tries
+def compute_intensity(model, config, device, attn_layer, ub=5, lb=None,
+                      ub_num=1, lb_num=0, min_valid=200):
+    if lb is None:
+        lb = ub
     bs = config.block_size
     vs = config.vocab_size
     location = bs + 5
                 im = GPTIntervention(model, idx)
                 im.intervent_attention(
                     attention_layer_num=attn_layer, location=location,
+                    unsorted_lb=lb, unsorted_ub=ub,
+                    unsorted_lb_num=lb_num, unsorted_ub_num=ub_num,
                     unsorted_intensity_inc=intens,
                     sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
                 g, n = im.check_if_still_works()
     return np.array(intensities), np.array(rates), np.array(counts)
+def compute_hijack(model, config, device, n_trials=2000):
+    """Hijack intervention on layer 0. Returns array of (current, boosted, predicted, correct)."""
+    INTENSITY = 10.0
     bs = config.block_size
     vs = config.vocab_size
+    attn_module = model.transformer.h[0].c_attn
+    records = []
+    for trial in range(n_trials):
+        idx = get_batch(vs, bs, device)
+        unsorted = idx[0, :bs]
+        sorted_part = idx[0, bs + 1: 2 * bs + 1]
+        with torch.no_grad():
+            _, _ = model(idx)
+        raw_attn = attn_module.raw_attn.clone()
+        for p in range(bs - 1):
+            location = bs + 1 + p
+            current_num = sorted_part[p].item()
+            correct_next = idx[0, location + 1].item()
+            next_loc_in_unsorted = (unsorted == correct_next).nonzero(as_tuple=True)[0]
+            if len(next_loc_in_unsorted) == 0:
+                continue
+            next_loc = next_loc_in_unsorted[0].item()
+            main_attn_val = raw_attn[location, next_loc].item()
+            candidates = [i for i in range(bs) if unsorted[i].item() != correct_next]
+            if not candidates:
+                continue
+            boost_idx = candidates[torch.randint(len(candidates), (1,)).item()]
+            boosted_number = unsorted[boost_idx].item()
+            def make_new_forward(loc, bidx, mav):
+                def new_forward(self_attn, x, layer_n=-1):
+                    B, T, C = x.size()
+                    qkv = self_attn.c_attn(x)
+                    q, k, v = qkv.split(self_attn.n_embd, dim=2)
+                    q = q.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
+                    k = k.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
+                    v = v.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
+                    attn = q @ k.transpose(-1, -2) * 0.1 / (k.size(-1)) ** 0.5
+                    attn[:, :, loc, bidx] = mav + INTENSITY
+                    attn = attn.masked_fill(self_attn.bias[:, :, :T, :T] == 0, float('-inf'))
+                    attn = F.softmax(attn, dim=-1)
+                    y = attn @ v
+                    y = y.transpose(1, 2).contiguous().view(B, T, C)
+                    y = self_attn.c_proj(y)
+                    return y
+                return new_forward
+            old_forward = attn_module.forward
+            attn_module.forward = types.MethodType(
+                make_new_forward(location, boost_idx, main_attn_val), attn_module)
             with torch.no_grad():
                 logits, _ = model(idx)
+            predicted = torch.argmax(logits, dim=-1)[0, location].item()
+            attn_module.forward = old_forward
+            records.append((current_num, boosted_number, predicted, correct_next))
+    return np.array(records, dtype=np.int32) if records else np.empty((0, 4), dtype=np.int32)
+def compute_separator_random(model, config, device, n_trials=1000):
+    """Separator-attention and random-target intervention on layer 0."""
+    INTENSITIES = [2.0, 6.0, 10.0]
+    UB_STANDARD = 60
     bs = config.block_size
     vs = config.vocab_size
+    sep_pos = bs
+    sep_records = []
+    rand_records = []
+    for trial in range(n_trials):
         idx = get_batch(vs, bs, device)
         with torch.no_grad():
             logits, _ = model(idx)
+        attn_layer0 = model.transformer.h[0].c_attn.attn
+        for p in range(bs - 1):
+            sorted_loc = bs + 1 + p
+            number_val = idx[0, sorted_loc].item()
+            attn_row = attn_layer0[sorted_loc, :sorted_loc + 1]
+            max_attn_pos = attn_row.argmax().item()
+            attends_to_sep = (max_attn_pos == sep_pos)
+            for intensity in INTENSITIES:
+                if attends_to_sep:
+                    try:
+                        im = GPTIntervention(model, idx)
+                        im.intervent_attention(
+                            attention_layer_num=0, location=sorted_loc,
+                            unsorted_lb=UB_STANDARD, unsorted_ub=UB_STANDARD,
+                            unsorted_lb_num=0, unsorted_ub_num=1,
+                            unsorted_intensity_inc=intensity,
+                            sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
+                        g, n = im.check_if_still_works()
+                        im.revert_attention(0)
+                        sep_records.append((number_val, intensity, int(g == n)))
+                    except:
+                        pass
+                try:
+                    im = GPTIntervention(model, idx)
+                    im.intervent_attention(
+                        attention_layer_num=0, location=sorted_loc,
+                        unsorted_lb=0, unsorted_ub=vs,
+                        unsorted_lb_num=0, unsorted_ub_num=1,
+                        unsorted_intensity_inc=intensity,
+                        sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
+                    g, n = im.check_if_still_works()
+                    im.revert_attention(0)
+                    rand_records.append((number_val, intensity, int(g == n)))
+                except:
+                    try:
+                        im = GPTIntervention(model, idx)
+                        im.intervent_attention(
+                            attention_layer_num=0, location=sorted_loc,
+                            unsorted_lb=vs, unsorted_ub=0,
+                            unsorted_lb_num=1, unsorted_ub_num=0,
+                            unsorted_intensity_inc=intensity,
+                            sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
+                        g, n = im.check_if_still_works()
+                        im.revert_attention(0)
+                        rand_records.append((number_val, intensity, int(g == n)))
+                    except:
+                        pass
+    sep = np.array(sep_records, dtype=np.int32) if sep_records else np.empty((0, 3), dtype=np.int32)
+    rand = np.array(rand_records, dtype=np.int32) if rand_records else np.empty((0, 3), dtype=np.int32)
+    return sep, rand
+def process_task(task, model, config, device):
     task_type = task['type']
     out_path = task['out']
     if os.path.exists(out_path):
         return True
     os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    itr = task.get('itr', 0)
     if task_type == 'baseline':
         pp, fs, ca, ce = compute_baseline(model, config, device)
         np.savez(out_path, per_pos_acc=pp, full_seq_acc=fs,
                  cond_acc=ca, cond_eligible=ce, itr=itr)
     elif task_type == 'ablation':
         pp, fs, ca, ce = compute_ablation(model, config, device, task['layer'])
         np.savez(out_path, per_pos_acc=pp, full_seq_acc=fs,
                  cond_acc=ca, cond_eligible=ce, skip_layer=task['layer'], itr=itr)
     elif task_type == 'cinclogits':
         cl, icl = compute_cinclogits(model, config, device, task['layer'])
         np.savez(out_path, clogit_icscore=cl, iclogit_icscore=icl, itr=itr)
     elif task_type == 'intensity':
         intensities, rates, counts = compute_intensity(
             model, config, device, task['layer'], ub=task['ub'])
         np.savez(out_path, intensities=intensities, success_rates=rates,
                  counts=counts, itr=itr)
+    elif task_type == 'intensity_asym':
+        intensities, rates, counts = compute_intensity(
+            model, config, device, task['layer'],
+            ub=task['unsorted_ub'], lb=task['unsorted_lb'],
+            ub_num=task['unsorted_ub_num'], lb_num=task['unsorted_lb_num'])
+        np.savez(out_path, intensities=intensities, success_rates=rates,
+                 counts=counts, itr=itr)
+    elif task_type == 'hijack':
+        data = compute_hijack(model, config, device, n_trials=task.get('trials', 2000))
+        np.savez(out_path, data=data)
+    elif task_type == 'separator_random':
+        sep, rand = compute_separator_random(model, config, device,
+                                             n_trials=task.get('trials', 1000))
+        np.savez(out_path, sep_data=sep, rand_data=rand)
     return True
 def main():
     parser = argparse.ArgumentParser()
+    parser.add_argument('--tasks-file', required=True)
     parser.add_argument('--gpu', type=int, required=True)
     args = parser.parse_args()
     with open(args.tasks_file) as f:
         task_list = json.load(f)
+    print(f"GPU {args.gpu}: {len(task_list)} tasks", flush=True)
     current_model = None
     current_ckpt = None
     for task in task_list:
         ckpt_path = task['ckpt_path']
         if ckpt_path != current_ckpt:
             t0 = time.time()
             model, config = load_model(ckpt_path, device)
             current_model = model
             current_ckpt = ckpt_path
             print(f"  Loaded {os.path.basename(ckpt_path)} ({time.time()-t0:.1f}s)", flush=True)
         t0 = time.time()
         try:
+            process_task(task, current_model, config, device)
             dt = time.time() - t0
             done += 1
             print(json.dumps({
                 'status': 'done', 'task': task['name'],
                 'gpu': args.gpu, 'elapsed': round(dt, 1),
                 'gpu': args.gpu, 'error': str(e)
             }), flush=True)
+    print(f"GPU {args.gpu}: all done ({done}/{len(task_list)})", flush=True)
 if __name__ == '__main__':
     main()

hijack_layer1_worker.py ADDED Viewed

	@@ -0,0 +1,229 @@

+#!/usr/bin/env python3
+"""
+Worker: run hijack intervention on layer 1 for a single checkpoint and plot heatmaps.
+Usage: python hijack_layer1_worker.py <checkpoint.pt> --output-dir <dir>
+"""
+import argparse
+import os
+import sys
+import types
+import numpy as np
+import torch
+import torch.nn.functional as F
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'grid-run'))
+from model_analysis import GPT, GPTConfig
+BIN_SIZE = 8
+N_BINS = 256 // BIN_SIZE
+INTENSITY = 10.0
+LAYER = 1
+N_TRIALS = 2000
+def remap_state_dict(sd):
+    new_sd = {}
+    for key, val in sd.items():
+        new_key = key
+        for i in range(10):
+            new_key = new_key.replace(f'transformer.h.{i}.attn.', f'transformer.h.{i}.c_attn.')
+            new_key = new_key.replace(f'transformer.h.{i}.mlp.', f'transformer.h.{i}.c_fc.')
+        new_sd[new_key] = val
+    return new_sd
+def load_model(ckpt_path, device):
+    """Load a checkpoint into a ``model_analysis.GPT`` and return ``(model, config)``.
+    The checkpoint must hold ``'model_config'`` (with ``'vocab_size'`` and
+    ``'block_size'``, optionally ``'use_final_LN'``) and ``'model_state_dict'``.
+    The returned model is moved to ``device`` and put in eval mode.
+    """
+    # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
+    ckpt = torch.load(ckpt_path, map_location='cpu')
+    mc = ckpt['model_config']
+    # The stored vocab size is one larger than the value given to GPTConfig;
+    # presumably the extra entry is the separator token -- confirm.
+    vocab_size = mc['vocab_size'] - 1
+    block_size = mc['block_size']
+    with_layer_norm = mc.get('use_final_LN', True)
+    config = GPTConfig(block_size=block_size, vocab_size=vocab_size,
+                       with_layer_norm=with_layer_norm)
+    model = GPT(config)
+    sd = remap_state_dict(ckpt['model_state_dict'])
+    # Keep only the first block_size * 4 + 1 position-embedding rows when the
+    # checkpoint has more (presumably the size the analysis model allocates).
+    grid_wpe_size = block_size * 4 + 1
+    if 'transformer.wpe.weight' in sd and sd['transformer.wpe.weight'].shape[0] > grid_wpe_size:
+        sd['transformer.wpe.weight'] = sd['transformer.wpe.weight'][:grid_wpe_size]
+    # Drop '<block>.c_attn.bias' entries but keep '<block>.c_attn.c_attn.bias';
+    # presumably the former is the causal-mask buffer, not a weight -- confirm.
+    keys_to_skip = [k for k in sd if k.endswith('.c_attn.bias') and 'c_attn.c_attn' not in k]
+    for k in keys_to_skip:
+        del sd[k]
+    # NOTE(review): lm_head.weight is removed, presumably because it is tied to
+    # the token embedding in the target model -- confirm.
+    if 'lm_head.weight' in sd:
+        del sd['lm_head.weight']
+    # strict=False: missing or unexpected keys do not raise here and the
+    # returned report is discarded, so mismatches pass silently.
+    model.load_state_dict(sd, strict=False)
+    model.to(device).eval()
+    return model, config
+def get_batch(vocab_size, block_size, device='cpu'):
+    x = torch.randperm(vocab_size)[:block_size]
+    vals, _ = torch.sort(x)
+    return torch.cat((x, torch.tensor([vocab_size]), vals), dim=0).unsqueeze(0).to(device)
+def compute_hijack(model, config, device):
+    bs = config.block_size
+    vs = config.vocab_size
+    attn_module = model.transformer.h[LAYER].c_attn
+    records = []
+    for trial in range(N_TRIALS):
+        idx = get_batch(vs, bs, device)
+        unsorted = idx[0, :bs]
+        sorted_part = idx[0, bs + 1: 2 * bs + 1]
+        with torch.no_grad():
+            _, _ = model(idx)
+        raw_attn = attn_module.raw_attn.clone()
+        for p in range(bs - 1):
+            location = bs + 1 + p
+            current_num = sorted_part[p].item()
+            correct_next = idx[0, location + 1].item()
+            next_loc_in_unsorted = (unsorted == correct_next).nonzero(as_tuple=True)[0]
+            if len(next_loc_in_unsorted) == 0:
+                continue
+            next_loc = next_loc_in_unsorted[0].item()
+            main_attn_val = raw_attn[location, next_loc].item()
+            candidates = [i for i in range(bs) if unsorted[i].item() != correct_next]
+            if not candidates:
+                continue
+            boost_idx = candidates[torch.randint(len(candidates), (1,)).item()]
+            boosted_number = unsorted[boost_idx].item()
+            def make_new_forward(loc, bidx, mav):
+                def new_forward(self_attn, x, layer_n=-1):
+                    B, T, C = x.size()
+                    qkv = self_attn.c_attn(x)
+                    q, k, v = qkv.split(self_attn.n_embd, dim=2)
+                    q = q.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
+                    k = k.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
+                    v = v.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
+                    attn = q @ k.transpose(-1, -2) * 0.1 / (k.size(-1)) ** 0.5
+                    attn[:, :, loc, bidx] = mav + INTENSITY
+                    attn = attn.masked_fill(self_attn.bias[:, :, :T, :T] == 0, float('-inf'))
+                    attn = F.softmax(attn, dim=-1)
+                    y = attn @ v
+                    y = y.transpose(1, 2).contiguous().view(B, T, C)
+                    y = self_attn.c_proj(y)
+                    return y
+                return new_forward
+            old_forward = attn_module.forward
+            attn_module.forward = types.MethodType(
+                make_new_forward(location, boost_idx, main_attn_val), attn_module)
+            with torch.no_grad():
+                logits, _ = model(idx)
+            predicted = torch.argmax(logits, dim=-1)[0, location].item()
+            attn_module.forward = old_forward
+            records.append((current_num, boosted_number, predicted, correct_next))
+    return np.array(records, dtype=np.int32) if records else np.empty((0, 4), dtype=np.int32)
+def plot_heatmaps(data, plot_dir, tag):
+    """Bin hijack records and save three heatmaps as PNG files in ``plot_dir``.
+    ``data`` is indexed as an (N, 4) array with columns
+    (current, boosted, predicted, correct). Rows are grouped by
+    ``current // BIN_SIZE`` (y axis) and ``boosted // BIN_SIZE`` (x axis).
+    Files written: breaking-rate, hijack-rate and sample-count heatmaps, each
+    named with the module-level ``LAYER``. ``tag`` is placed in the titles.
+    Prints a message and returns without plotting when ``data`` is empty.
+    """
+    if len(data) == 0:
+        print("No data to plot!")
+        return
+    current = data[:, 0]; boosted = data[:, 1]
+    predicted = data[:, 2]; correct = data[:, 3]
+    # broken: prediction differs from the correct next token.
+    # hijacked: prediction equals the boosted (intervened-toward) number.
+    broken = (predicted != correct).astype(np.float64)
+    hijacked = (predicted == boosted).astype(np.float64)
+    cur_bin = np.clip(current // BIN_SIZE, 0, N_BINS - 1)
+    bst_bin = np.clip(boosted // BIN_SIZE, 0, N_BINS - 1)
+    # NaN marks cells with too few samples; they stay NaN in the rate maps.
+    break_map = np.full((N_BINS, N_BINS), np.nan)
+    hijack_map = np.full((N_BINS, N_BINS), np.nan)
+    count_map = np.zeros((N_BINS, N_BINS), dtype=int)
+    for cb in range(N_BINS):
+        for bb in range(N_BINS):
+            mask = (cur_bin == cb) & (bst_bin == bb)
+            n = mask.sum()
+            count_map[cb, bb] = n
+            # Rates are only reported for cells with at least 5 samples.
+            if n >= 5:
+                break_map[cb, bb] = broken[mask].mean()
+                hijack_map[cb, bb] = hijacked[mask].mean()
+    # Label every fourth bin with the smallest number it contains.
+    tick_labels = [f'{i * BIN_SIZE}' for i in range(0, N_BINS, 4)]
+    tick_positions = list(range(0, N_BINS, 4))
+    for arr, cmap, label, fname in [
+        (break_map, 'YlOrRd', 'Breaking Rate',
+         f'hijack_breaking_rate_heatmap_layer{LAYER}.png'),
+        (hijack_map, 'YlOrRd', 'Hijack Rate',
+         f'hijack_hijack_rate_heatmap_layer{LAYER}.png'),
+    ]:
+        fig, ax = plt.subplots(figsize=(10, 8.5))
+        im = ax.imshow(arr, aspect='auto', cmap=cmap, vmin=0, vmax=1,
+                       interpolation='nearest', origin='lower')
+        ax.set_xlabel('Intervened-toward Number (binned)', fontsize=12)
+        ax.set_ylabel('Current Number (binned)', fontsize=12)
+        title_map = {'Breaking Rate': 'Breaking Rate: P(pred \u2260 correct)',
+                     'Hijack Rate': 'Hijack Rate: P(pred == intervened target)'}
+        ax.set_title(f'{title_map[label]}\n{tag}  layer={LAYER}  intensity={INTENSITY}',
+                     fontsize=12, fontweight='bold')
+        ax.set_xticks(tick_positions); ax.set_xticklabels(tick_labels, fontsize=8)
+        ax.set_yticks(tick_positions); ax.set_yticklabels(tick_labels, fontsize=8)
+        plt.colorbar(im, ax=ax, label=label, shrink=0.85)
+        fig.tight_layout()
+        fig.savefig(os.path.join(plot_dir, fname), dpi=200, bbox_inches='tight')
+        # Closes pyplot's current figure, which is the one just created above.
+        plt.close()
+        print(f"Saved: {fname}")
+    # Third figure: raw sample counts per cell, with no vmin/vmax clamp.
+    fig, ax = plt.subplots(figsize=(10, 8.5))
+    im = ax.imshow(count_map, aspect='auto', cmap='viridis',
+                   interpolation='nearest', origin='lower')
+    ax.set_xlabel('Intervened-toward Number (binned)', fontsize=12)
+    ax.set_ylabel('Current Number (binned)', fontsize=12)
+    ax.set_title(f'Sample Count per (current, target) bin\n{tag}  layer={LAYER}  intensity={INTENSITY}',
+                 fontsize=11, fontweight='bold')
+    ax.set_xticks(tick_positions); ax.set_xticklabels(tick_labels, fontsize=8)
+    ax.set_yticks(tick_positions); ax.set_yticklabels(tick_labels, fontsize=8)
+    plt.colorbar(im, ax=ax, label='Count', shrink=0.85)
+    fig.tight_layout()
+    fname = f'hijack_sample_count_heatmap_layer{LAYER}.png'
+    fig.savefig(os.path.join(plot_dir, fname), dpi=200, bbox_inches='tight')
+    plt.close()
+    print(f"Saved: {fname}")
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('checkpoint', type=str)
+    parser.add_argument('--output-dir', type=str, required=True)
+    args = parser.parse_args()
+    device = 'cuda'
+    os.makedirs(args.output_dir, exist_ok=True)
+    print(f"Loading {os.path.basename(args.checkpoint)} ...", flush=True)
+    model, config = load_model(args.checkpoint, device)
+    print(f"Running hijack layer {LAYER} ({N_TRIALS} trials) ...", flush=True)
+    data = compute_hijack(model, config, device)
+    print(f"Collected {len(data)} records", flush=True)
+    bn = os.path.basename(args.checkpoint).replace('.pt', '')
+    parts = bn.split('__')
+    ckpt_type = parts[1] if len(parts) > 1 else 'final'
+    itr = int(ckpt_type.replace('ckpt', '')) if ckpt_type.startswith('ckpt') else 1000000
+    tag = f"V=256  B=16  lr=0.03  iters={itr}  dseed=1337  iseed=1337"
+    plot_heatmaps(data, args.output_dir, tag)
+if __name__ == '__main__':
+    main()

model_tbyt_train.py ADDED Viewed

	@@ -0,0 +1,123 @@

+"""
+Model matching the 200k-checkpoints architecture exactly.
+Block uses self.attn / self.mlp naming (matching 200k state dict).
+max_seq_len configurable (200k model uses 193).
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+class MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.fc_1 = nn.Linear(config.n_embd, 3 * config.n_embd)
+        self.gelu = nn.GELU(approximate='tanh')
+        self.fc_2 = nn.Linear(config.n_embd * 3, config.n_embd)
+        self.NANO_SCALE_GPT = True
+    def forward(self, x):
+        return self.fc_2(self.gelu(self.fc_1(x)))
+class CasualSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.n_embd = config.n_embd
+        self.n_heads = config.n_heads
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        seq_len = config.max_seq_len
+        self.register_buffer('bias', torch.tril(torch.ones(seq_len, seq_len)).view(1, 1, seq_len, seq_len))
+        self.c_proj.NANOGPT_SCALE_INIT = True
+        self.config = config
+    def forward(self, x, layer_n=-1):
+        B, T, C = x.size()
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        q = q.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
+        k = k.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
+        v = v.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
+        attn = q @ k.transpose(-1, -2) * 0.1 / (k.size(-1)) ** 0.5
+        attn = attn.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+        attn = F.softmax(attn, dim=-1)
+        y = attn @ v
+        y = y.transpose(1, 2).contiguous().view(B, T, C)
+        y = self.c_proj(y)
+        return y
+class Block(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.attn = CasualSelfAttention(config)
+        self.mlp = MLP(config)
+        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.ln_2 = nn.LayerNorm(config.n_embd)
+    def forward(self, x, layer_n=-1):
+        x = x + self.attn(self.ln_1(x), layer_n=layer_n)
+        return x + self.mlp(self.ln_2(x))
+class GPT(nn.Module):
+    """Decoder-only transformer built from ``config.n_layers`` ``Block`` modules.
+    ``forward`` returns ``(logits, loss)``, where the loss is the cross-entropy
+    of predicting ``idx[:, block_size + 1:]`` from the logits at positions
+    ``block_size .. T - 2``.
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.n_layers = config.n_layers
+        self.transformer = nn.ModuleDict(dict(
+            # One extra embedding row beyond vocab_size.
+            wte=nn.Embedding(config.vocab_size + 1, config.n_embd),
+            # NOTE(review): wpe is created here but forward() below never reads it.
+            wpe=nn.Embedding(config.max_seq_len, config.n_embd),
+            h=nn.ModuleList([Block(config) for _ in range(config.n_layers)]),
+            ln_f=nn.LayerNorm(config.n_embd)
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        # Weight tying: the output projection is rebound to the token-embedding
+        # matrix, which has vocab_size + 1 rows.
+        self.lm_head.weight = self.transformer.wte.weight
+        self.apply(self._init_weights)
+    def _init_weights(self, module):
+        """Normal(0, 0.02) init for Linear/Embedding weights, zero Linear biases.
+        Linear modules carrying a ``NANOGPT_SCALE_INIT`` attribute use a std
+        scaled by ``(2 * n_layers) ** -0.5``.
+        """
+        std = 0.02
+        if isinstance(module, nn.Linear):
+            if hasattr(module, 'NANOGPT_SCALE_INIT'):
+                std *= (2 * self.n_layers) ** -0.5
+            torch.nn.init.normal_(module.weight, mean=0, std=std)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        if isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0, std=std)
+    def forward(self, idx, targets=None, flag=False):
+        """Return ``(logits, loss)`` for token ids ``idx`` of shape (B, T).
+        ``targets`` and ``flag`` are accepted but not used in this body; the
+        loss targets are taken from ``idx`` itself.
+        """
+        B, T = idx.size()
+        # Token embeddings only; no positional term is added here.
+        x = self.transformer.wte(idx)
+        layer_n = 0
+        for block in self.transformer.h:
+            # Blocks receive a 1-based layer index.
+            layer_n += 1
+            x = block(x, layer_n)
+        if self.config.with_layer_norm:
+            x = self.transformer.ln_f(x)
+        logits = self.lm_head(x)
+        # Logits at positions block_size .. T-2 are scored against the tokens
+        # at positions block_size+1 .. T-1 (next-token prediction).
+        tensor1 = logits[:, self.config.block_size:T - 1, :].contiguous().view(-1, logits.size(-1))
+        tensor2 = idx[:, self.config.block_size + 1:].contiguous().view(-1)
+        loss = F.cross_entropy(tensor1, tensor2)
+        return logits, loss
+class GPTConfig:
+    """Plain hyperparameter holder for the GPT model.
+
+    The class attributes below are the defaults. The constructor stores an
+    instance-level value only for arguments that are actually supplied, so
+    anything left as ``None`` keeps reading the class default.
+
+    Args:
+        block_size: overrides ``block_size`` when not ``None``.
+        vocab_size: overrides ``vocab_size`` when not ``None``.
+        with_layer_norm: always stored on the instance.
+        max_seq_len: always stored on the instance.
+        n_layers: overrides ``n_layers`` when not ``None``.
+        n_heads: overrides ``n_heads`` when not ``None``.
+        n_embd: overrides ``n_embd`` when not ``None``.
+    """
+    # The model's forward pass scores only positions from block_size onward.
+    block_size: int = 16
+    vocab_size: int = 256
+    n_layers: int = 2
+    n_heads: int = 1
+    n_embd: int = 64
+    with_layer_norm: bool = True
+    max_seq_len: int = 193
+    def __init__(self, block_size=None, vocab_size=None, with_layer_norm=True, max_seq_len=193,
+                 n_layers=None, n_heads=None, n_embd=None):
+        # The new keywords come last so existing positional calls are unaffected.
+        optional_overrides = (
+            ('block_size', block_size),
+            ('vocab_size', vocab_size),
+            ('n_layers', n_layers),
+            ('n_heads', n_heads),
+            ('n_embd', n_embd),
+        )
+        for field, value in optional_overrides:
+            if value is not None:
+                setattr(self, field, value)
+        self.with_layer_norm = with_layer_norm
+        self.max_seq_len = max_seq_len

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_accuracy.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_conditional_accuracy.png ADDED Viewed

Git LFS Details

SHA256: fae10f51c5b6b25cfc31bb8cbb6e3e29483888cdee11137553af94f2f95e6651
Pointer size: 131 Bytes
Size of remote file: 137 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_per_position.png ADDED Viewed

Git LFS Details

SHA256: 280d85628c5e03d612d1f0ed485ea7acbcd8bc374a65c1278a64ddbca1a47dd7
Pointer size: 131 Bytes
Size of remote file: 124 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/attn_heatmaps.png ADDED Viewed

Git LFS Details

SHA256: fd088ed11b1ae6cbba95371546304b47b4fee2423b855b67807771531b9bc619
Pointer size: 131 Bytes
Size of remote file: 117 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3f1680ae8653c687e1aa97f956d9f07eeb969bb1404087097c4c1a799ecb6fe
+size 526858

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.png ADDED Viewed

Git LFS Details

SHA256: e44904c556676242f1333664df3520613af6f86b1f3b5ad99abed7665fdacb27
Pointer size: 131 Bytes
Size of remote file: 144 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2174fce6c178f07e77275564dc56614aa037372e9610bdb52af9623b5f0680a3
+size 526858

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.png ADDED Viewed

Git LFS Details

SHA256: 87bfd7e2a3d42e66320e03576f0a5917d09194888360bd188829b4d08a4bc97a
Pointer size: 131 Bytes
Size of remote file: 186 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_accuracy.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_conditional_accuracy.png ADDED Viewed

Git LFS Details

SHA256: fb85a2c46c790ed4f8b7a63b8dfdae49ae3db9ec2fa9eaafeaa91c7675d1692b
Pointer size: 131 Bytes
Size of remote file: 104 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer0.png ADDED Viewed

Git LFS Details

SHA256: a9988ed5bb5feed6b466aa5e73bccad5a03cf02452d1ce911b1177f284aba129
Pointer size: 131 Bytes
Size of remote file: 104 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer1.png ADDED Viewed

Git LFS Details

SHA256: 7b7160129b67bef3f0a4786a97fe6901a066e52ae79f93384558cf16ef3945c3
Pointer size: 131 Bytes
Size of remote file: 101 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_bynext_heatmap_layer0.png ADDED Viewed

Git LFS Details

SHA256: 7e17a431eedda876718314acc3b2b181c98d4b25fbfc5d433f2cf09fd4525ad0
Pointer size: 131 Bytes
Size of remote file: 107 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer0.png ADDED Viewed

Git LFS Details

SHA256: e3c5106599602dc40b469c264416b738bc7366e1807717995cba204fa73b32a7
Pointer size: 131 Bytes
Size of remote file: 100 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer1.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_bynext_heatmap_layer0.png ADDED Viewed

Git LFS Details

SHA256: 43840a185a0a85ede33ce834c9a2db3de9f66e12e2b92db7a8f13b9d0d1d323b
Pointer size: 131 Bytes
Size of remote file: 103 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer0.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer1.png ADDED Viewed

Git LFS Details

SHA256: 1ac9805a37a5f9f22a5796793e3d9d7ddb4d5509ab15e59a1b069dfb4a494ef7
Pointer size: 131 Bytes
Size of remote file: 103 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer0.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer1.png ADDED Viewed

Git LFS Details

SHA256: 6b0c1f28efc8e3d8ce28ab5868fca7fb41f4ee2f756c063e564c05daa935a2c9
Pointer size: 131 Bytes
Size of remote file: 100 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_asym_ub60_lb60.png ADDED Viewed

Git LFS Details

SHA256: bd63c5a606a82715f394bccd4305470c8f7ffefd5533f28ead4c2a4b12f707b3
Pointer size: 131 Bytes
Size of remote file: 123 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub10.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub15.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub20.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub30.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub50.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub60.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub60_high.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub10.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub15.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub20.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub30.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub50.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub60.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub60_high.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_random_layer0.png ADDED Viewed

Git LFS Details

SHA256: 0f067dad4da72649cb8a137002051042df7001f419e0c76cb8be47ae0241e3ab
Pointer size: 131 Bytes
Size of remote file: 340 kB

outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_separator_layer0.png ADDED Viewed

Git LFS Details

SHA256: 954494723b1475b0fb7d55f56f35e1a1393e68a3efada62959188705e2b238c3
Pointer size: 131 Bytes
Size of remote file: 104 kB

outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_accuracy.png ADDED Viewed

outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_conditional_accuracy.png ADDED Viewed

Git LFS Details

SHA256: 198a1f5de4526f3811c7b77f3c29f45fa1269150c1a13181888a8a481e592319
Pointer size: 131 Bytes
Size of remote file: 161 kB

outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_per_position.png ADDED Viewed

Git LFS Details

SHA256: 50a8324c292b9013c133d054290d55165cd96a6744117b256b13cfcad0121a3f
Pointer size: 131 Bytes
Size of remote file: 149 kB

outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/attn_heatmaps.png ADDED Viewed

Git LFS Details

SHA256: 3f4db97c84a4a4fa5167788465b10623c2fb0b6fd077fcc5f3413c0715d17634
Pointer size: 131 Bytes
Size of remote file: 117 kB

outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/avg_attn_by_number_layer0.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eebc87b61451f35001b6cb56a92cb9129d97f7bed18c2b02e539917f108967b3
+size 526858