gatmiry commited on
Commit
c7f1373
·
verified ·
1 Parent(s): beda614

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +329 -0
  2. __pycache__/model_tbyt_train.cpython-312.pyc +0 -0
  3. attn_by_number_worker.py +194 -0
  4. gpu_worker.py +236 -73
  5. hijack_layer1_worker.py +229 -0
  6. model_tbyt_train.py +123 -0
  7. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_accuracy.png +0 -0
  8. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_conditional_accuracy.png +3 -0
  9. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_per_position.png +3 -0
  10. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/attn_heatmaps.png +3 -0
  11. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.npz +3 -0
  12. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.png +3 -0
  13. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.npz +3 -0
  14. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.png +3 -0
  15. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_accuracy.png +0 -0
  16. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_conditional_accuracy.png +3 -0
  17. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer0.png +3 -0
  18. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer1.png +3 -0
  19. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_bynext_heatmap_layer0.png +3 -0
  20. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer0.png +3 -0
  21. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer1.png +0 -0
  22. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_bynext_heatmap_layer0.png +3 -0
  23. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer0.png +0 -0
  24. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer1.png +3 -0
  25. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer0.png +0 -0
  26. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer1.png +3 -0
  27. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0.png +0 -0
  28. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_asym_ub60_lb60.png +3 -0
  29. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub10.png +0 -0
  30. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub15.png +0 -0
  31. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub20.png +0 -0
  32. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub30.png +0 -0
  33. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub50.png +0 -0
  34. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub60.png +0 -0
  35. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub60_high.png +0 -0
  36. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1.png +0 -0
  37. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub10.png +0 -0
  38. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub15.png +0 -0
  39. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub20.png +0 -0
  40. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub30.png +0 -0
  41. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub50.png +0 -0
  42. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub60.png +0 -0
  43. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub60_high.png +0 -0
  44. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_random_layer0.png +3 -0
  45. outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_separator_layer0.png +3 -0
  46. outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_accuracy.png +0 -0
  47. outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_conditional_accuracy.png +3 -0
  48. outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_per_position.png +3 -0
  49. outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/attn_heatmaps.png +3 -0
  50. outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/avg_attn_by_number_layer0.npz +3 -0
.gitattributes CHANGED
@@ -278,3 +278,332 @@ outputs/plots_N256_B16_ds1338_is1340_final/perlocation/perlocation_layer0.png fi
278
  outputs/plots_N256_B16_ds1338_is1340_final/perlocation/perlocation_layer1.png filter=lfs diff=lfs merge=lfs -text
279
  outputs/plots_N256_B16_ds1338_is1340_final/pernumber/pernumber_layer0.png filter=lfs diff=lfs merge=lfs -text
280
  outputs/plots_N256_B16_ds1338_is1340_final/pernumber/pernumber_layer1.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  outputs/plots_N256_B16_ds1338_is1340_final/perlocation/perlocation_layer1.png filter=lfs diff=lfs merge=lfs -text
279
  outputs/plots_N256_B16_ds1338_is1340_final/pernumber/pernumber_layer0.png filter=lfs diff=lfs merge=lfs -text
280
  outputs/plots_N256_B16_ds1338_is1340_final/pernumber/pernumber_layer1.png filter=lfs diff=lfs merge=lfs -text
281
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
282
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
283
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
284
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
285
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
286
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
287
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
288
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
289
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
290
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
291
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
292
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
293
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
294
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
295
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
296
+ outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
297
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
298
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
299
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
300
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
301
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
302
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
303
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
304
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
305
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
306
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
307
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
308
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
309
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
310
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
311
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
312
+ outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
313
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
314
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
315
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
316
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
317
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
318
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
319
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
320
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
321
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
322
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
323
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
324
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
325
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
326
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
327
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
328
+ outputs/plots_V256_B16_LR3e-2_MI150000_E64_H1_L2_ds1337_is1337_ckpt150000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
329
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
330
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
331
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
332
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
333
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
334
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
335
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
336
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
337
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
338
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
339
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
340
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
341
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
342
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
343
+ outputs/plots_V256_B16_LR3e-2_MI200000_E64_H1_L2_ds1337_is1337_ckpt200000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
344
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
345
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
346
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
347
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
348
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
349
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
350
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
351
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
352
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
353
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
354
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
355
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
356
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
357
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
358
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
359
+ outputs/plots_V256_B16_LR3e-2_MI250000_E64_H1_L2_ds1337_is1337_ckpt250000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
360
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
361
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
362
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
363
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
364
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
365
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
366
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
367
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
368
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
369
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
370
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
371
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
372
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
373
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
374
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
375
+ outputs/plots_V256_B16_LR3e-2_MI300000_E64_H1_L2_ds1337_is1337_ckpt300000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
376
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
377
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
378
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
379
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
380
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
381
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
382
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
383
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
384
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
385
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
386
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
387
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
388
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
389
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
390
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
391
+ outputs/plots_V256_B16_LR3e-2_MI350000_E64_H1_L2_ds1337_is1337_ckpt350000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
392
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
393
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
394
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
395
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
396
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
397
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
398
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
399
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
400
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
401
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
402
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
403
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
404
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/hijack_sample_count_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
405
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
406
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
407
+ outputs/plots_V256_B16_LR3e-2_MI400000_E64_H1_L2_ds1337_is1337_ckpt400000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
408
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
409
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
410
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
411
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
412
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
413
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
414
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
415
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
416
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
417
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
418
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
419
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
420
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
421
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
422
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
423
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
424
+ outputs/plots_V256_B16_LR3e-2_MI450000_E64_H1_L2_ds1337_is1337_ckpt450000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
425
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
426
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
427
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
428
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
429
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
430
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
431
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
432
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
433
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
434
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
435
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
436
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
437
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
438
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
439
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
440
+ outputs/plots_V256_B16_LR3e-2_MI500000_E64_H1_L2_ds1337_is1337_ckpt500000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
441
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
442
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
443
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
444
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
445
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
446
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
447
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
448
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
449
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
450
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
451
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
452
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
453
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_hijack_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
454
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
455
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
456
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
457
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
458
+ outputs/plots_V256_B16_LR3e-2_MI50000_E64_H1_L2_ds1337_is1337_ckpt50000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
459
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
460
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
461
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
462
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
463
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
464
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
465
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
466
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
467
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
468
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
469
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
470
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
471
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
472
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
473
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
474
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
475
+ outputs/plots_V256_B16_LR3e-2_MI550000_E64_H1_L2_ds1337_is1337_ckpt550000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
476
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
477
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
478
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
479
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
480
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
481
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
482
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
483
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
484
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
485
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
486
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
487
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
488
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
489
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
490
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
491
+ outputs/plots_V256_B16_LR3e-2_MI600000_E64_H1_L2_ds1337_is1337_ckpt600000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
492
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
493
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
494
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
495
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
496
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
497
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
498
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
499
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
500
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
501
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
502
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
503
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
504
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
505
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
506
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
507
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
508
+ outputs/plots_V256_B16_LR3e-2_MI650000_E64_H1_L2_ds1337_is1337_ckpt650000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
509
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
510
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
511
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
512
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
513
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
514
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
515
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
516
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
517
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
518
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
519
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
520
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
521
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
522
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
523
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
524
+ outputs/plots_V256_B16_LR3e-2_MI700000_E64_H1_L2_ds1337_is1337_ckpt700000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
525
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
526
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
527
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
528
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
529
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
530
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
531
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
532
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
533
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
534
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
535
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
536
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
537
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
538
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
539
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
540
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
541
+ outputs/plots_V256_B16_LR3e-2_MI750000_E64_H1_L2_ds1337_is1337_ckpt750000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
542
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
543
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
544
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
545
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
546
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
547
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
548
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
549
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
550
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
551
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
552
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
553
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
554
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
555
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
556
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
557
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
558
+ outputs/plots_V256_B16_LR3e-2_MI800000_E64_H1_L2_ds1337_is1337_ckpt800000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
559
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/ablation_accuracy.png filter=lfs diff=lfs merge=lfs -text
560
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
561
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
562
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
563
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
564
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
565
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
566
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
567
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
568
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
569
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
570
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
571
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
572
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
573
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
574
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
575
+ outputs/plots_V256_B16_LR3e-2_MI850000_E64_H1_L2_ds1337_is1337_ckpt850000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
576
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
577
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
578
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
579
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
580
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
581
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
582
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
583
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
584
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
585
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
586
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
587
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
588
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
589
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
590
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
591
+ outputs/plots_V256_B16_LR3e-2_MI900000_E64_H1_L2_ds1337_is1337_ckpt900000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
592
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/ablation_accuracy.png filter=lfs diff=lfs merge=lfs -text
593
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/ablation_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
594
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/ablation_per_position.png filter=lfs diff=lfs merge=lfs -text
595
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/attn_heatmaps.png filter=lfs diff=lfs merge=lfs -text
596
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/avg_attn_by_number_layer0.png filter=lfs diff=lfs merge=lfs -text
597
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/avg_attn_by_number_layer1.png filter=lfs diff=lfs merge=lfs -text
598
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/baseline_conditional_accuracy.png filter=lfs diff=lfs merge=lfs -text
599
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/cinclogits_layer0.png filter=lfs diff=lfs merge=lfs -text
600
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/cinclogits_layer1.png filter=lfs diff=lfs merge=lfs -text
601
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_breaking_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
602
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_breaking_rate_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
603
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_breaking_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
604
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_hijack_rate_bynext_heatmap_layer0.png filter=lfs diff=lfs merge=lfs -text
605
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_hijack_rate_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
606
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/hijack_sample_count_heatmap_layer1.png filter=lfs diff=lfs merge=lfs -text
607
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/intensity_layer0_asym_ub60_lb60.png filter=lfs diff=lfs merge=lfs -text
608
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/intervention_pernumber_random_layer0.png filter=lfs diff=lfs merge=lfs -text
609
+ outputs/plots_V256_B16_LR3e-2_MI950000_E64_H1_L2_ds1337_is1337_ckpt950000/intervention_pernumber_separator_layer0.png filter=lfs diff=lfs merge=lfs -text
__pycache__/model_tbyt_train.cpython-312.pyc ADDED
Binary file (9.79 kB). View file
 
attn_by_number_worker.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Worker: compute average attention-by-number-value heatmap (256×256)
4
+ for both attention layers of each assigned checkpoint.
5
+
6
+ For each token position with value i, we accumulate its attention weights
7
+ to all visible positions with value j, then normalize by the total count
8
+ of from-positions with value i. The result: avg_matrix[i,j] ≈ fraction
9
+ of attention that number i pays to number j (rows sum to ~1).
10
+ """
11
+ import argparse, json, os, sys, time, types
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn.functional as F
15
+ import matplotlib
16
+ matplotlib.use('Agg')
17
+ import matplotlib.pyplot as plt
18
+
19
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'grid-run'))
20
+ from model_analysis import GPT, GPTConfig
21
+
22
+ VOCAB_SIZE = 256
23
+ BLOCK_SIZE = 16
24
+ SEQ_LEN = 2 * BLOCK_SIZE + 1 # 33
25
+ N_LAYERS = 2
26
+ BATCH_SIZE = 1024
27
+ N_BATCHES = 100 # 102 400 sequences total
28
+
29
+
30
def remap_state_dict(sd):
    """Translate legacy checkpoint key names to the analysis model's names.

    For transformer layers 0-9, renames the attention submodule prefix
    ``attn.`` -> ``c_attn.`` and the MLP prefix ``mlp.`` -> ``c_fc.``.
    Keys that match neither pattern are passed through unchanged.
    Returns a new dict; the input state dict is not mutated.
    """
    renames = []
    for layer in range(10):
        renames.append((f'transformer.h.{layer}.attn.', f'transformer.h.{layer}.c_attn.'))
        renames.append((f'transformer.h.{layer}.mlp.', f'transformer.h.{layer}.c_fc.'))

    remapped = {}
    for old_key, tensor in sd.items():
        key = old_key
        for before, after in renames:
            key = key.replace(before, after)
        remapped[key] = tensor
    return remapped
39
+
40
+
41
def load_model(ckpt_path, device):
    """Load a training checkpoint into an analysis GPT, in eval mode on `device`.

    Rebuilds the GPTConfig from the checkpoint's stored model_config (the
    stored vocab_size includes the separator token, hence the -1), remaps
    legacy parameter names, trims an oversized positional-embedding table,
    and drops keys the analysis model does not define before a non-strict
    load. Returns (model, config).
    """
    checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=False)
    meta = checkpoint['model_config']
    cfg = GPTConfig(
        block_size=meta['block_size'],
        vocab_size=meta['vocab_size'] - 1,
        with_layer_norm=meta.get('use_final_LN', True),
    )
    net = GPT(cfg)

    state = remap_state_dict(checkpoint['model_state_dict'])
    # NOTE(review): the analysis model apparently only uses the first
    # block_size*4+1 positions — trim a larger trained table to fit.
    max_pos = meta['block_size'] * 4 + 1
    wpe = state.get('transformer.wpe.weight')
    if wpe is not None and wpe.shape[0] > max_pos:
        state['transformer.wpe.weight'] = wpe[:max_pos]
    # Drop module-level '.c_attn.bias' buffers (but keep the projection's
    # own 'c_attn.c_attn.bias'); the analysis module does not define them.
    stale = [k for k in state if k.endswith('.c_attn.bias') and 'c_attn.c_attn' not in k]
    for k in stale:
        del state[k]
    state.pop('lm_head.weight', None)

    net.load_state_dict(state, strict=False)
    net.to(device).eval()
    return net, cfg
59
+
60
+
61
def patch_attention(model):
    """Monkey-patch the first N_LAYERS attention modules so each forward
    pass also records its full softmax weights in ``module.batched_attn``
    with shape (B, n_heads, T, T).

    The math reproduces the patched module's attention, including the
    extra 0.1 temperature factor on the pre-softmax scores — keep this in
    sync with the training implementation.
    """
    def _recording_forward(mod, x, layer_n=-1):
        batch, seq, width = x.size()
        head_dim = width // mod.n_heads
        query, key, value = mod.c_attn(x).split(mod.n_embd, dim=2)
        query = query.view(batch, seq, mod.n_heads, head_dim).transpose(1, 2)
        key = key.view(batch, seq, mod.n_heads, head_dim).transpose(1, 2)
        value = value.view(batch, seq, mod.n_heads, head_dim).transpose(1, 2)
        # Scaled dot-product scores with the model's extra 0.1 factor.
        scores = query @ key.transpose(-1, -2) * 0.1 / key.size(-1) ** 0.5
        # Causal mask from the module's registered lower-triangular buffer.
        scores = scores.masked_fill(mod.bias[:, :, :seq, :seq] == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        mod.batched_attn = weights  # stashed for the caller to harvest
        mixed = (weights @ value).transpose(1, 2).contiguous().view(batch, seq, width)
        return mod.c_proj(mixed)

    # One shared function bound per module; the original per-layer closure
    # factory produced identical functions, so behavior is unchanged.
    for layer_idx in range(N_LAYERS):
        attn_mod = model.transformer.h[layer_idx].c_attn
        attn_mod.forward = types.MethodType(_recording_forward, attn_mod)
83
+
84
+
85
def get_batch(device):
    """Sample a batch of sorting tasks: [unsorted numbers, SEP, sorted copy].

    Each row holds BLOCK_SIZE distinct values drawn without replacement
    from [0, VOCAB_SIZE); the separator token id is VOCAB_SIZE itself.
    Returns a (BATCH_SIZE, 2*BLOCK_SIZE+1) long tensor on `device`.
    """
    # argsort of uniform noise yields a random permutation per row;
    # keeping the first BLOCK_SIZE columns samples without replacement.
    perm = torch.rand(BATCH_SIZE, VOCAB_SIZE, device=device).argsort(dim=1)
    unsorted = perm[:, :BLOCK_SIZE]
    target = unsorted.sort(dim=1).values
    separator = torch.full((BATCH_SIZE, 1), VOCAB_SIZE, dtype=torch.long, device=device)
    return torch.cat([unsorted, separator, target], dim=1)
90
+
91
+
92
@torch.no_grad()
def compute(model, device):
    """Accumulate number->number attention statistics over N_BATCHES batches.

    For every ordered value pair (i, j), sums the attention weight that
    positions holding value i pay to causally-visible positions holding
    value j, separately per layer. Each row i is then divided by how many
    times value i occurred, giving the average attention distribution of
    number i (per the module docstring, rows sum to ~1).

    Returns (list of per-layer (VS, VS) float64 ndarrays, from-count ndarray).
    Requires `patch_attention(model)` to have been applied first.
    """
    T = SEQ_LEN
    VS = VOCAB_SIZE
    # Lower-triangular mask: position t may only attend to positions <= t.
    causal = torch.tril(torch.ones(T, T, device=device, dtype=torch.bool))

    # Flat (VS*VS,) accumulators per layer; float64 limits drift over ~100k sequences.
    sum_mat = [torch.zeros(VS * VS, device=device, dtype=torch.float64) for _ in range(N_LAYERS)]
    from_cnt = torch.zeros(VS, device=device, dtype=torch.float64)

    for _ in range(N_BATCHES):
        tokens = get_batch(device)
        # Forward pass is run only for its side effect: the patched
        # attention modules stash their softmax weights in .batched_attn.
        model(tokens)

        # Broadcast token values over the (T, T) attention grid:
        # from_v[b, t, s] = value at query position t; to_v[b, t, s] = value at key position s.
        from_v = tokens.unsqueeze(2).expand(-1, -1, T)
        to_v = tokens.unsqueeze(1).expand(-1, T, -1)
        # Keep causal cells where both endpoints are real numbers
        # (the separator token equals VS and is excluded).
        valid = causal.unsqueeze(0) & (from_v < VS) & (to_v < VS)

        # Flatten each (i, j) value pair to bucket index i*VS + j.
        flat_idx = (from_v * VS + to_v).long()
        idx_v = flat_idx[valid]

        for layer in range(N_LAYERS):
            # [:, 0] takes head 0 only — fine for the single-head (H1)
            # checkpoints this script targets; confirm before reuse on multi-head.
            attn = model.transformer.h[layer].c_attn.batched_attn[:, 0]
            sum_mat[layer].scatter_add_(0, idx_v, attn[valid].double())

        # Count every occurrence of each number value (separator excluded)
        # to use as the per-row normalizer.
        tok_valid = tokens[tokens < VS]
        from_cnt.scatter_add_(0, tok_valid.long(),
                              torch.ones(tok_valid.numel(), device=device, dtype=torch.float64))

    results = []
    # clamp(min=1) guards the division for values that never appeared.
    fc = from_cnt.clamp(min=1).unsqueeze(1)
    for layer in range(N_LAYERS):
        avg = (sum_mat[layer].view(VS, VS) / fc).cpu().numpy()
        results.append(avg)
    return results, from_cnt.cpu().numpy()
126
+
127
+
128
def plot_heatmap(avg, layer, out_dir, ckpt_label):
    """Render one (256, 256) number->number attention heatmap as a PNG.

    The color scale is capped at the 99th percentile of the strictly
    positive entries so a few dominant cells do not wash out the rest.
    Returns the path of the written image.
    """
    positive = avg[avg > 0]
    top = np.percentile(positive, 99) if positive.size > 0 else 1.0

    fig, axis = plt.subplots(figsize=(10, 9))
    image = axis.imshow(avg, aspect='auto', origin='lower', cmap='inferno',
                        vmin=0, vmax=top, interpolation='nearest')
    axis.set_xlabel('To number (attended-to)', fontsize=12)
    axis.set_ylabel('From number (attending)', fontsize=12)
    axis.set_title(f'Layer {layer+1}: avg attention (number → number)\n{ckpt_label}',
                   fontsize=11)
    tick_positions = list(range(0, 256, 32)) + [255]
    axis.set_xticks(tick_positions)
    axis.set_yticks(tick_positions)
    fig.colorbar(image, ax=axis, shrink=0.82, label='Avg attention weight')
    plt.tight_layout()

    out_path = os.path.join(out_dir, f'avg_attn_by_number_layer{layer}.png')
    fig.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.close(fig)
    return out_path
147
+
148
+
149
def main():
    """Worker entry point: pin the assigned GPU, then compute and plot the
    attention-by-number heatmaps for every checkpoint in the tasks file,
    skipping checkpoints whose two output PNGs already exist."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--tasks-file', required=True)
    parser.add_argument('--gpu', type=int, required=True)
    opts = parser.parse_args()

    # NOTE(review): relies on being set before the first CUDA context is
    # created so 'cuda' resolves to the assigned physical device.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(opts.gpu)
    device = 'cuda'

    with open(opts.tasks_file) as fh:
        tasks = json.load(fh)

    print(f"GPU {opts.gpu}: {len(tasks)} checkpoints", flush=True)

    for task in tasks:
        ckpt_path = task['ckpt_path']
        out_dir = task['out_dir']
        label = os.path.basename(ckpt_path).replace('.pt', '')

        # Idempotency check: both layer heatmaps already rendered?
        finished = all(
            os.path.exists(os.path.join(out_dir, f'avg_attn_by_number_layer{n}.png'))
            for n in range(2)
        )
        if finished:
            print(f" Skip (exists): {label}", flush=True)
            continue

        start = time.time()
        model, _ = load_model(ckpt_path, device)
        patch_attention(model)
        avgs, from_cnt = compute(model, device)

        os.makedirs(out_dir, exist_ok=True)
        for layer in range(N_LAYERS):
            np.savez(os.path.join(out_dir, f'avg_attn_by_number_layer{layer}.npz'),
                     avg_attn=avgs[layer], from_count=from_cnt)
            plot_heatmap(avgs[layer], layer, out_dir, label)

        dt = time.time() - start
        print(f" Done: {label} ({dt:.1f}s)", flush=True)
        # Free GPU memory before loading the next checkpoint.
        del model
        torch.cuda.empty_cache()

    print(f"GPU {opts.gpu}: all done.", flush=True)


if __name__ == '__main__':
    main()
gpu_worker.py CHANGED
@@ -1,23 +1,25 @@
1
  """
2
- GPU worker: processes a batch of analysis tasks on a single GPU.
3
- Model is loaded once per checkpoint and reused for all tasks on that checkpoint.
4
- Prints JSON status lines so the launcher can track progress.
5
  """
6
  import argparse
7
  import json
8
  import os
9
  import sys
10
  import time
 
11
  import numpy as np
12
  import torch
 
13
 
14
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'grid-run'))
15
  from model_analysis import GPT, GPTConfig, GPTIntervention
16
 
17
 
18
- def remap_state_dict(sd_100k):
19
  new_sd = {}
20
- for key, val in sd_100k.items():
21
  new_key = key
22
  for i in range(10):
23
  new_key = new_key.replace(f'transformer.h.{i}.attn.', f'transformer.h.{i}.c_attn.')
@@ -59,6 +61,75 @@ def get_batch(vocab_size, block_size, device='cpu'):
59
  return torch.cat((x, torch.tensor([vocab_size]), vals), dim=0).unsqueeze(0).to(device)
60
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  def compute_cinclogits(model, config, device, attn_layer, num_tries=100):
63
  bs = config.block_size
64
  vs = config.vocab_size
@@ -87,7 +158,10 @@ def compute_cinclogits(model, config, device, attn_layer, num_tries=100):
87
  return acc_cl / num_tries, acc_icl / num_tries
88
 
89
 
90
- def compute_intensity(model, config, device, attn_layer, ub=5, min_valid=200):
 
 
 
91
  bs = config.block_size
92
  vs = config.vocab_size
93
  location = bs + 5
@@ -102,8 +176,8 @@ def compute_intensity(model, config, device, attn_layer, ub=5, min_valid=200):
102
  im = GPTIntervention(model, idx)
103
  im.intervent_attention(
104
  attention_layer_num=attn_layer, location=location,
105
- unsorted_lb=ub, unsorted_ub=ub,
106
- unsorted_lb_num=0, unsorted_ub_num=1,
107
  unsorted_intensity_inc=intens,
108
  sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
109
  g, n = im.check_if_still_works()
@@ -116,106 +190,197 @@ def compute_intensity(model, config, device, attn_layer, ub=5, min_valid=200):
116
  return np.array(intensities), np.array(rates), np.array(counts)
117
 
118
 
119
- def compute_ablation(model, config, device, skip_layer, num_trials=500):
 
 
120
  bs = config.block_size
121
  vs = config.vocab_size
122
- block = model.transformer.h[skip_layer]
123
- orig_fwd = block.forward
124
 
125
- def skip_attn(x, layer_n=-1):
126
- return x + block.c_fc(block.ln_2(x))
127
- block.forward = skip_attn
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
- pp = np.zeros(bs)
130
- fs = 0
131
- cc = np.zeros(bs)
132
- ce = np.zeros(bs)
133
- try:
134
- for _ in range(num_trials):
135
- idx = get_batch(vs, bs, device)
136
  with torch.no_grad():
137
  logits, _ = model(idx)
138
- preds = torch.argmax(logits[0, bs:2*bs, :], dim=1)
139
- targets = idx[0, bs+1:]
140
- correct = (preds == targets).cpu().numpy()
141
- pp += correct
142
- if correct.all():
143
- fs += 1
144
- ok = True
145
- for i in range(bs):
146
- if ok:
147
- ce[i] += 1
148
- if correct[i]:
149
- cc[i] += 1
150
- else:
151
- ok = False
152
- else:
153
- break
154
- finally:
155
- block.forward = orig_fwd
156
- return pp / num_trials, fs / num_trials, np.where(ce > 0, cc / ce, 0.0), ce
157
 
 
 
158
 
159
- def compute_baseline(model, config, device, num_trials=500):
 
 
 
 
 
 
160
  bs = config.block_size
161
  vs = config.vocab_size
162
- pp = np.zeros(bs)
163
- fs = 0
164
- cc = np.zeros(bs)
165
- ce = np.zeros(bs)
166
- for _ in range(num_trials):
 
167
  idx = get_batch(vs, bs, device)
168
  with torch.no_grad():
169
  logits, _ = model(idx)
170
- preds = torch.argmax(logits[0, bs:2*bs, :], dim=1)
171
- targets = idx[0, bs+1:]
172
- correct = (preds == targets).cpu().numpy()
173
- pp += correct
174
- if correct.all():
175
- fs += 1
176
- ok = True
177
- for i in range(bs):
178
- if ok:
179
- ce[i] += 1
180
- if correct[i]:
181
- cc[i] += 1
182
- else:
183
- ok = False
184
- else:
185
- break
186
- return pp / num_trials, fs / num_trials, np.where(ce > 0, cc / ce, 0.0), ce
187
-
188
-
189
- def process_task(task, model, config, device, out_dir, itr):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  task_type = task['type']
191
  out_path = task['out']
192
  if os.path.exists(out_path):
193
  return True
194
 
195
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
 
196
 
197
  if task_type == 'baseline':
198
  pp, fs, ca, ce = compute_baseline(model, config, device)
199
  np.savez(out_path, per_pos_acc=pp, full_seq_acc=fs,
200
  cond_acc=ca, cond_eligible=ce, itr=itr)
 
201
  elif task_type == 'ablation':
202
  pp, fs, ca, ce = compute_ablation(model, config, device, task['layer'])
203
  np.savez(out_path, per_pos_acc=pp, full_seq_acc=fs,
204
  cond_acc=ca, cond_eligible=ce, skip_layer=task['layer'], itr=itr)
 
205
  elif task_type == 'cinclogits':
206
  cl, icl = compute_cinclogits(model, config, device, task['layer'])
207
  np.savez(out_path, clogit_icscore=cl, iclogit_icscore=icl, itr=itr)
 
208
  elif task_type == 'intensity':
209
  intensities, rates, counts = compute_intensity(
210
  model, config, device, task['layer'], ub=task['ub'])
211
  np.savez(out_path, intensities=intensities, success_rates=rates,
212
  counts=counts, itr=itr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  return True
214
 
215
 
216
  def main():
217
  parser = argparse.ArgumentParser()
218
- parser.add_argument('--tasks-file', required=True, help='JSON file with task list')
219
  parser.add_argument('--gpu', type=int, required=True)
220
  args = parser.parse_args()
221
 
@@ -225,8 +390,7 @@ def main():
225
  with open(args.tasks_file) as f:
226
  task_list = json.load(f)
227
 
228
- print(f"GPU {args.gpu}: {len(task_list)} tasks across "
229
- f"{len(set(t['ckpt_path'] for t in task_list))} checkpoints", flush=True)
230
 
231
  current_model = None
232
  current_ckpt = None
@@ -234,21 +398,18 @@ def main():
234
 
235
  for task in task_list:
236
  ckpt_path = task['ckpt_path']
237
-
238
  if ckpt_path != current_ckpt:
239
  t0 = time.time()
240
  model, config = load_model(ckpt_path, device)
241
  current_model = model
242
  current_ckpt = ckpt_path
243
- itr = task.get('itr', 100000)
244
  print(f" Loaded {os.path.basename(ckpt_path)} ({time.time()-t0:.1f}s)", flush=True)
245
 
246
  t0 = time.time()
247
  try:
248
- process_task(task, current_model, config, device, None, itr)
249
  dt = time.time() - t0
250
  done += 1
251
- # Print status as JSON for launcher to parse
252
  print(json.dumps({
253
  'status': 'done', 'task': task['name'],
254
  'gpu': args.gpu, 'elapsed': round(dt, 1),
@@ -261,6 +422,8 @@ def main():
261
  'gpu': args.gpu, 'error': str(e)
262
  }), flush=True)
263
 
 
 
264
 
265
  if __name__ == '__main__':
266
  main()
 
1
  """
2
+ GPU worker for 1000k-checkpoint analysis.
3
+ Processes all task types on a single GPU: baseline, ablation, cinclogits,
4
+ intensity (various ub), asymmetric intensity, hijack, separator/random.
5
  """
6
  import argparse
7
  import json
8
  import os
9
  import sys
10
  import time
11
+ import types
12
  import numpy as np
13
  import torch
14
+ import torch.nn.functional as F
15
 
16
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'grid-run'))
17
  from model_analysis import GPT, GPTConfig, GPTIntervention
18
 
19
 
20
+ def remap_state_dict(sd):
21
  new_sd = {}
22
+ for key, val in sd.items():
23
  new_key = key
24
  for i in range(10):
25
  new_key = new_key.replace(f'transformer.h.{i}.attn.', f'transformer.h.{i}.c_attn.')
 
61
  return torch.cat((x, torch.tensor([vocab_size]), vals), dim=0).unsqueeze(0).to(device)
62
 
63
 
64
def compute_baseline(model, config, device, num_trials=500):
    """Measure the unmodified model's sorting accuracy over random sequences.

    Returns a 4-tuple:
      per_pos_acc   -- fraction of trials correct at each sorted position
      full_seq_acc  -- fraction of trials with every position correct
      cond_acc      -- accuracy at position i given positions < i were correct
      cond_eligible -- number of trials eligible for cond_acc at each position
    """
    bs = config.block_size
    vs = config.vocab_size
    per_pos = np.zeros(bs)
    full_correct = 0
    cond_hits = np.zeros(bs)
    cond_elig = np.zeros(bs)

    for _ in range(num_trials):
        idx = get_batch(vs, bs, device)
        with torch.no_grad():
            logits, _ = model(idx)
        preds = torch.argmax(logits[0, bs:2*bs, :], dim=1)
        targets = idx[0, bs+1:]
        correct = (preds == targets).cpu().numpy()

        per_pos += correct
        if correct.all():
            full_correct += 1
        # Position i is "eligible" for the conditional stat iff every earlier
        # position was predicted correctly (position 0 is always eligible).
        eligible = np.concatenate(([True], np.cumprod(correct[:-1]).astype(bool)))
        cond_elig += eligible
        cond_hits += eligible & correct

    cond_acc = np.where(cond_elig > 0, cond_hits / cond_elig, 0.0)
    return per_pos / num_trials, full_correct / num_trials, cond_acc, cond_elig
92
+
93
+
94
def compute_ablation(model, config, device, skip_layer, num_trials=500):
    """Accuracy stats with the attention sublayer of `skip_layer` ablated.

    Temporarily replaces the block's forward with one that applies only the
    MLP residual branch (x + mlp(ln_2(x))), then measures the same four
    statistics as compute_baseline. The original forward is restored in a
    `finally` so the model is left intact even on error.
    """
    bs = config.block_size
    block = model.transformer.h[skip_layer]
    orig_fwd = block.forward

    # In the analysis model the MLP submodule is named `c_fc` (remapped from
    # the training checkpoint's `mlp`), so this skips attention only.
    # layer_n is accepted to match Block.forward's signature.
    def skip_attn(x, layer_n=-1):
        return x + block.c_fc(block.ln_2(x))
    block.forward = skip_attn

    pp = np.zeros(bs)   # per-position correct counts
    fs = 0              # fully-correct trial count
    cc = np.zeros(bs)   # conditional correct counts
    ce = np.zeros(bs)   # conditional eligibility counts
    try:
        for _ in range(num_trials):
            idx = get_batch(config.vocab_size, bs, device)
            with torch.no_grad():
                logits, _ = model(idx)
            preds = torch.argmax(logits[0, bs:2*bs, :], dim=1)
            targets = idx[0, bs+1:]
            correct = (preds == targets).cpu().numpy()
            pp += correct
            if correct.all():
                fs += 1
            # Conditional accuracy: count positions only until the first
            # wrong prediction in this trial.
            ok = True
            for i in range(bs):
                if ok:
                    ce[i] += 1
                    if correct[i]:
                        cc[i] += 1
                    else:
                        ok = False
                else:
                    break
    finally:
        # Always restore the original forward, even if a trial raised.
        block.forward = orig_fwd
    return pp / num_trials, fs / num_trials, np.where(ce > 0, cc / ce, 0.0), ce
131
+
132
+
133
  def compute_cinclogits(model, config, device, attn_layer, num_tries=100):
134
  bs = config.block_size
135
  vs = config.vocab_size
 
158
  return acc_cl / num_tries, acc_icl / num_tries
159
 
160
 
161
+ def compute_intensity(model, config, device, attn_layer, ub=5, lb=None,
162
+ ub_num=1, lb_num=0, min_valid=200):
163
+ if lb is None:
164
+ lb = ub
165
  bs = config.block_size
166
  vs = config.vocab_size
167
  location = bs + 5
 
176
  im = GPTIntervention(model, idx)
177
  im.intervent_attention(
178
  attention_layer_num=attn_layer, location=location,
179
+ unsorted_lb=lb, unsorted_ub=ub,
180
+ unsorted_lb_num=lb_num, unsorted_ub_num=ub_num,
181
  unsorted_intensity_inc=intens,
182
  sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
183
  g, n = im.check_if_still_works()
 
190
  return np.array(intensities), np.array(rates), np.array(counts)
191
 
192
 
193
def compute_hijack(model, config, device, n_trials=2000):
    """Hijack intervention on layer 0. Returns array of (current, boosted, predicted, correct)."""
    INTENSITY = 10.0
    bs = config.block_size
    vs = config.vocab_size
    attn_module = model.transformer.h[0].c_attn
    records = []

    for trial in range(n_trials):
        idx = get_batch(vs, bs, device)
        unsorted = idx[0, :bs]
        sorted_part = idx[0, bs + 1: 2 * bs + 1]

        # Clean forward pass to populate raw (pre-softmax) attention scores.
        # NOTE(review): assumes the analysis attention module stores its
        # pre-softmax scores in `.raw_attn` — defined in model_analysis.
        with torch.no_grad():
            _, _ = model(idx)
        raw_attn = attn_module.raw_attn.clone()

        for p in range(bs - 1):
            location = bs + 1 + p
            current_num = sorted_part[p].item()
            correct_next = idx[0, location + 1].item()

            # Find where the correct next token sits in the unsorted half.
            next_loc_in_unsorted = (unsorted == correct_next).nonzero(as_tuple=True)[0]
            if len(next_loc_in_unsorted) == 0:
                continue
            next_loc = next_loc_in_unsorted[0].item()
            main_attn_val = raw_attn[location, next_loc].item()

            # Pick a random wrong token to boost attention toward.
            candidates = [i for i in range(bs) if unsorted[i].item() != correct_next]
            if not candidates:
                continue

            boost_idx = candidates[torch.randint(len(candidates), (1,)).item()]
            boosted_number = unsorted[boost_idx].item()

            # Build a replacement forward that reproduces the attention math
            # but overwrites one pre-softmax score: the query at `loc` is
            # forced to attend to `bidx` with the correct token's score + 10.
            def make_new_forward(loc, bidx, mav):
                def new_forward(self_attn, x, layer_n=-1):
                    B, T, C = x.size()
                    qkv = self_attn.c_attn(x)
                    q, k, v = qkv.split(self_attn.n_embd, dim=2)
                    q = q.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
                    k = k.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
                    v = v.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
                    # 0.1 logit scale matches the trained model's attention.
                    attn = q @ k.transpose(-1, -2) * 0.1 / (k.size(-1)) ** 0.5
                    attn[:, :, loc, bidx] = mav + INTENSITY
                    attn = attn.masked_fill(self_attn.bias[:, :, :T, :T] == 0, float('-inf'))
                    attn = F.softmax(attn, dim=-1)
                    y = attn @ v
                    y = y.transpose(1, 2).contiguous().view(B, T, C)
                    y = self_attn.c_proj(y)
                    return y
                return new_forward

            old_forward = attn_module.forward
            attn_module.forward = types.MethodType(
                make_new_forward(location, boost_idx, main_attn_val), attn_module)

            with torch.no_grad():
                logits, _ = model(idx)
            predicted = torch.argmax(logits, dim=-1)[0, location].item()

            # Restore the unhijacked forward before the next position.
            attn_module.forward = old_forward
            records.append((current_num, boosted_number, predicted, correct_next))

    return np.array(records, dtype=np.int32) if records else np.empty((0, 4), dtype=np.int32)
258
+
259
+
260
def compute_separator_random(model, config, device, n_trials=1000):
    """Separator-attention and random-target intervention on layer 0.

    For each sorted-half position, two experiments are run at several
    intensities:
      * if the position's layer-0 attention peak is the separator token,
        boost attention toward a fixed number (UB_STANDARD) and record
        whether the model still sorts correctly;
      * always boost attention toward an arbitrary number (via an out-of-range
        lb/ub pair; a flipped fallback is tried if the first form raises).

    Returns (sep_data, rand_data), each an int32 array of rows
    (number_val, intensity, still_correct). Note: dtype=int32 truncates the
    float intensity (2.0 -> 2), which the original also did.

    Fix over the original: the three bare `except:` clauses are narrowed to
    `except Exception:` so KeyboardInterrupt/SystemExit are no longer
    swallowed mid-run.
    """
    INTENSITIES = [2.0, 6.0, 10.0]
    UB_STANDARD = 60
    bs = config.block_size
    vs = config.vocab_size
    sep_pos = bs  # index of the separator token in the sequence

    sep_records = []
    rand_records = []

    for trial in range(n_trials):
        idx = get_batch(vs, bs, device)
        # Forward pass populates the recorded attention map on the module.
        with torch.no_grad():
            logits, _ = model(idx)
        attn_layer0 = model.transformer.h[0].c_attn.attn

        for p in range(bs - 1):
            sorted_loc = bs + 1 + p
            number_val = idx[0, sorted_loc].item()

            # Does this query's attention peak land on the separator?
            attn_row = attn_layer0[sorted_loc, :sorted_loc + 1]
            max_attn_pos = attn_row.argmax().item()
            attends_to_sep = (max_attn_pos == sep_pos)

            for intensity in INTENSITIES:
                if attends_to_sep:
                    try:
                        im = GPTIntervention(model, idx)
                        im.intervent_attention(
                            attention_layer_num=0, location=sorted_loc,
                            unsorted_lb=UB_STANDARD, unsorted_ub=UB_STANDARD,
                            unsorted_lb_num=0, unsorted_ub_num=1,
                            unsorted_intensity_inc=intensity,
                            sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
                        g, n = im.check_if_still_works()
                        im.revert_attention(0)
                        sep_records.append((number_val, intensity, int(g == n)))
                    except Exception:
                        # Best-effort: skip positions the intervention rejects.
                        pass

                try:
                    im = GPTIntervention(model, idx)
                    im.intervent_attention(
                        attention_layer_num=0, location=sorted_loc,
                        unsorted_lb=0, unsorted_ub=vs,
                        unsorted_lb_num=0, unsorted_ub_num=1,
                        unsorted_intensity_inc=intensity,
                        sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
                    g, n = im.check_if_still_works()
                    im.revert_attention(0)
                    rand_records.append((number_val, intensity, int(g == n)))
                except Exception:
                    # Fallback: retry with the lb/ub roles flipped.
                    try:
                        im = GPTIntervention(model, idx)
                        im.intervent_attention(
                            attention_layer_num=0, location=sorted_loc,
                            unsorted_lb=vs, unsorted_ub=0,
                            unsorted_lb_num=1, unsorted_ub_num=0,
                            unsorted_intensity_inc=intensity,
                            sorted_lb=0, sorted_num=0, sorted_intensity_inc=0.0)
                        g, n = im.check_if_still_works()
                        im.revert_attention(0)
                        rand_records.append((number_val, intensity, int(g == n)))
                    except Exception:
                        pass

    sep = np.array(sep_records, dtype=np.int32) if sep_records else np.empty((0, 3), dtype=np.int32)
    rand = np.array(rand_records, dtype=np.int32) if rand_records else np.empty((0, 3), dtype=np.int32)
    return sep, rand
330
+
331
+
332
def process_task(task, model, config, device):
    """Dispatch one analysis task and save its result as .npz.

    Skips (returns True) if the output file already exists, so re-runs are
    idempotent. Raises ValueError for an unknown task type — previously an
    unknown type silently returned True without writing anything, and the
    launcher would have reported it as "done".
    """
    task_type = task['type']
    out_path = task['out']
    if os.path.exists(out_path):
        return True

    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    itr = task.get('itr', 0)  # training iteration of the checkpoint, for plots

    if task_type == 'baseline':
        pp, fs, ca, ce = compute_baseline(model, config, device)
        np.savez(out_path, per_pos_acc=pp, full_seq_acc=fs,
                 cond_acc=ca, cond_eligible=ce, itr=itr)

    elif task_type == 'ablation':
        pp, fs, ca, ce = compute_ablation(model, config, device, task['layer'])
        np.savez(out_path, per_pos_acc=pp, full_seq_acc=fs,
                 cond_acc=ca, cond_eligible=ce, skip_layer=task['layer'], itr=itr)

    elif task_type == 'cinclogits':
        cl, icl = compute_cinclogits(model, config, device, task['layer'])
        np.savez(out_path, clogit_icscore=cl, iclogit_icscore=icl, itr=itr)

    elif task_type == 'intensity':
        intensities, rates, counts = compute_intensity(
            model, config, device, task['layer'], ub=task['ub'])
        np.savez(out_path, intensities=intensities, success_rates=rates,
                 counts=counts, itr=itr)

    elif task_type == 'intensity_asym':
        intensities, rates, counts = compute_intensity(
            model, config, device, task['layer'],
            ub=task['unsorted_ub'], lb=task['unsorted_lb'],
            ub_num=task['unsorted_ub_num'], lb_num=task['unsorted_lb_num'])
        np.savez(out_path, intensities=intensities, success_rates=rates,
                 counts=counts, itr=itr)

    elif task_type == 'hijack':
        data = compute_hijack(model, config, device, n_trials=task.get('trials', 2000))
        np.savez(out_path, data=data)

    elif task_type == 'separator_random':
        sep, rand = compute_separator_random(model, config, device,
                                             n_trials=task.get('trials', 1000))
        np.savez(out_path, sep_data=sep, rand_data=rand)

    else:
        raise ValueError(f"Unknown task type: {task_type!r}")

    return True
379
 
380
 
381
  def main():
382
  parser = argparse.ArgumentParser()
383
+ parser.add_argument('--tasks-file', required=True)
384
  parser.add_argument('--gpu', type=int, required=True)
385
  args = parser.parse_args()
386
 
 
390
  with open(args.tasks_file) as f:
391
  task_list = json.load(f)
392
 
393
+ print(f"GPU {args.gpu}: {len(task_list)} tasks", flush=True)
 
394
 
395
  current_model = None
396
  current_ckpt = None
 
398
 
399
  for task in task_list:
400
  ckpt_path = task['ckpt_path']
 
401
  if ckpt_path != current_ckpt:
402
  t0 = time.time()
403
  model, config = load_model(ckpt_path, device)
404
  current_model = model
405
  current_ckpt = ckpt_path
 
406
  print(f" Loaded {os.path.basename(ckpt_path)} ({time.time()-t0:.1f}s)", flush=True)
407
 
408
  t0 = time.time()
409
  try:
410
+ process_task(task, current_model, config, device)
411
  dt = time.time() - t0
412
  done += 1
 
413
  print(json.dumps({
414
  'status': 'done', 'task': task['name'],
415
  'gpu': args.gpu, 'elapsed': round(dt, 1),
 
422
  'gpu': args.gpu, 'error': str(e)
423
  }), flush=True)
424
 
425
+ print(f"GPU {args.gpu}: all done ({done}/{len(task_list)})", flush=True)
426
+
427
 
428
  if __name__ == '__main__':
429
  main()
hijack_layer1_worker.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Worker: run hijack intervention on layer 1 for a single checkpoint and plot heatmaps.
4
+ Usage: python hijack_layer1_worker.py <checkpoint.pt> --output-dir <dir>
5
+ """
6
+ import argparse
7
+ import os
8
+ import sys
9
+ import types
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+
14
+ import matplotlib
15
+ matplotlib.use('Agg')
16
+ import matplotlib.pyplot as plt
17
+
18
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'grid-run'))
19
+ from model_analysis import GPT, GPTConfig
20
+
21
+ BIN_SIZE = 8
22
+ N_BINS = 256 // BIN_SIZE
23
+ INTENSITY = 10.0
24
+ LAYER = 1
25
+ N_TRIALS = 2000
26
+
27
+
28
def remap_state_dict(sd):
    """Rename training-checkpoint keys to the analysis-model layout.

    Training checkpoints name the sublayers ``transformer.h.<i>.attn`` and
    ``transformer.h.<i>.mlp``; the analysis GPT uses ``c_attn`` and ``c_fc``.
    Uses a regex on the layer index, so it works for any number of layers
    (the original hard-coded layers 0-9 only).
    """
    import re
    new_sd = {}
    for key, val in sd.items():
        new_key = re.sub(r'transformer\.h\.(\d+)\.attn\.',
                         r'transformer.h.\1.c_attn.', key)
        new_key = re.sub(r'transformer\.h\.(\d+)\.mlp\.',
                         r'transformer.h.\1.c_fc.', new_key)
        new_sd[new_key] = val
    return new_sd
37
+
38
+
39
def load_model(ckpt_path, device):
    """Load a training checkpoint into the analysis GPT and set eval mode.

    Returns (model, config). The checkpoint's state dict is surgically
    adapted to the analysis model's layout before loading.
    """
    ckpt = torch.load(ckpt_path, map_location='cpu')
    mc = ckpt['model_config']
    # The training config counts the separator token in vocab_size; the
    # analysis GPTConfig adds it back internally, so subtract one here.
    vocab_size = mc['vocab_size'] - 1
    block_size = mc['block_size']
    with_layer_norm = mc.get('use_final_LN', True)

    config = GPTConfig(block_size=block_size, vocab_size=vocab_size,
                       with_layer_norm=with_layer_norm)
    model = GPT(config)

    sd = remap_state_dict(ckpt['model_state_dict'])
    # The analysis model's positional table is block_size*4+1 rows; truncate
    # a larger trained table to fit.
    grid_wpe_size = block_size * 4 + 1
    if 'transformer.wpe.weight' in sd and sd['transformer.wpe.weight'].shape[0] > grid_wpe_size:
        sd['transformer.wpe.weight'] = sd['transformer.wpe.weight'][:grid_wpe_size]
    # Drop attention-module biases that the analysis model's wrapper doesn't
    # declare (keep the inner c_attn.c_attn linear-layer bias).
    keys_to_skip = [k for k in sd if k.endswith('.c_attn.bias') and 'c_attn.c_attn' not in k]
    for k in keys_to_skip:
        del sd[k]
    # lm_head is weight-tied to wte in the analysis model; drop the duplicate.
    if 'lm_head.weight' in sd:
        del sd['lm_head.weight']

    # strict=False: tolerates the keys deliberately removed above, but will
    # also silently ignore any other mismatch — keep the surgery list in sync.
    model.load_state_dict(sd, strict=False)
    model.to(device).eval()
    return model, config
63
+
64
+
65
def get_batch(vocab_size, block_size, device='cpu'):
    """Build one sort-task sequence of shape (1, 2*block_size + 1):
    [unsorted distinct numbers | separator token (= vocab_size) | sorted copy].
    """
    unsorted = torch.randperm(vocab_size)[:block_size]
    sorted_vals = torch.sort(unsorted).values
    separator = torch.tensor([vocab_size])
    sequence = torch.cat((unsorted, separator, sorted_vals), dim=0)
    return sequence.unsqueeze(0).to(device)
69
+
70
+
71
def compute_hijack(model, config, device):
    """Run the layer-LAYER attention hijack over N_TRIALS random sequences.

    For every sorted-half position, force the query's pre-softmax attention
    toward a random wrong unsorted token (score = correct token's score +
    INTENSITY) and record what the model then predicts.

    Returns an int32 array with rows (current, boosted, predicted, correct),
    or an empty (0, 4) array if nothing was recorded.
    """
    bs = config.block_size
    vs = config.vocab_size
    attn_module = model.transformer.h[LAYER].c_attn
    records = []

    for trial in range(N_TRIALS):
        idx = get_batch(vs, bs, device)
        unsorted = idx[0, :bs]
        sorted_part = idx[0, bs + 1: 2 * bs + 1]

        # Clean forward pass to populate the module's recorded raw scores.
        # NOTE(review): relies on model_analysis attention exposing `.raw_attn`
        # (pre-softmax scores) after a forward pass.
        with torch.no_grad():
            _, _ = model(idx)
        raw_attn = attn_module.raw_attn.clone()

        for p in range(bs - 1):
            location = bs + 1 + p
            current_num = sorted_part[p].item()
            correct_next = idx[0, location + 1].item()

            # Where the correct next token sits in the unsorted half.
            next_loc_in_unsorted = (unsorted == correct_next).nonzero(as_tuple=True)[0]
            if len(next_loc_in_unsorted) == 0:
                continue
            next_loc = next_loc_in_unsorted[0].item()
            main_attn_val = raw_attn[location, next_loc].item()

            # Choose a random wrong token to redirect attention toward.
            candidates = [i for i in range(bs) if unsorted[i].item() != correct_next]
            if not candidates:
                continue

            boost_idx = candidates[torch.randint(len(candidates), (1,)).item()]
            boosted_number = unsorted[boost_idx].item()

            # Replacement forward: same attention math as the model, with one
            # pre-softmax score overwritten at (loc, bidx).
            def make_new_forward(loc, bidx, mav):
                def new_forward(self_attn, x, layer_n=-1):
                    B, T, C = x.size()
                    qkv = self_attn.c_attn(x)
                    q, k, v = qkv.split(self_attn.n_embd, dim=2)
                    q = q.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
                    k = k.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
                    v = v.view(B, T, self_attn.n_heads, C // self_attn.n_heads).transpose(1, 2)
                    # 0.1 logit scale matches the trained architecture.
                    attn = q @ k.transpose(-1, -2) * 0.1 / (k.size(-1)) ** 0.5
                    attn[:, :, loc, bidx] = mav + INTENSITY
                    attn = attn.masked_fill(self_attn.bias[:, :, :T, :T] == 0, float('-inf'))
                    attn = F.softmax(attn, dim=-1)
                    y = attn @ v
                    y = y.transpose(1, 2).contiguous().view(B, T, C)
                    y = self_attn.c_proj(y)
                    return y
                return new_forward

            old_forward = attn_module.forward
            attn_module.forward = types.MethodType(
                make_new_forward(location, boost_idx, main_attn_val), attn_module)

            with torch.no_grad():
                logits, _ = model(idx)
            predicted = torch.argmax(logits, dim=-1)[0, location].item()

            # Restore the original forward before the next position.
            attn_module.forward = old_forward
            records.append((current_num, boosted_number, predicted, correct_next))

    return np.array(records, dtype=np.int32) if records else np.empty((0, 4), dtype=np.int32)
134
+
135
+
136
def plot_heatmaps(data, plot_dir, tag):
    """Render breaking-rate, hijack-rate, and sample-count heatmaps.

    `data` is the (N, 4) int array from compute_hijack with columns
    (current, boosted, predicted, correct). Numbers are bucketed into
    N_BINS bins of width BIN_SIZE; bins with fewer than 5 samples are left
    as NaN (blank in the plot). `tag` is a free-text subtitle.
    """
    if len(data) == 0:
        print("No data to plot!")
        return

    current = data[:, 0]; boosted = data[:, 1]
    predicted = data[:, 2]; correct = data[:, 3]
    # broken: model no longer predicts the correct token;
    # hijacked: model predicts exactly the boosted token.
    broken = (predicted != correct).astype(np.float64)
    hijacked = (predicted == boosted).astype(np.float64)
    cur_bin = np.clip(current // BIN_SIZE, 0, N_BINS - 1)
    bst_bin = np.clip(boosted // BIN_SIZE, 0, N_BINS - 1)

    break_map = np.full((N_BINS, N_BINS), np.nan)
    hijack_map = np.full((N_BINS, N_BINS), np.nan)
    count_map = np.zeros((N_BINS, N_BINS), dtype=int)
    for cb in range(N_BINS):
        for bb in range(N_BINS):
            mask = (cur_bin == cb) & (bst_bin == bb)
            n = mask.sum()
            count_map[cb, bb] = n
            if n >= 5:  # require a minimum sample count per cell
                break_map[cb, bb] = broken[mask].mean()
                hijack_map[cb, bb] = hijacked[mask].mean()

    # Tick every 4th bin, labeled with the bin's starting number.
    tick_labels = [f'{i * BIN_SIZE}' for i in range(0, N_BINS, 4)]
    tick_positions = list(range(0, N_BINS, 4))

    for arr, cmap, label, fname in [
        (break_map, 'YlOrRd', 'Breaking Rate',
         f'hijack_breaking_rate_heatmap_layer{LAYER}.png'),
        (hijack_map, 'YlOrRd', 'Hijack Rate',
         f'hijack_hijack_rate_heatmap_layer{LAYER}.png'),
    ]:
        fig, ax = plt.subplots(figsize=(10, 8.5))
        im = ax.imshow(arr, aspect='auto', cmap=cmap, vmin=0, vmax=1,
                       interpolation='nearest', origin='lower')
        ax.set_xlabel('Intervened-toward Number (binned)', fontsize=12)
        ax.set_ylabel('Current Number (binned)', fontsize=12)
        title_map = {'Breaking Rate': 'Breaking Rate: P(pred \u2260 correct)',
                     'Hijack Rate': 'Hijack Rate: P(pred == intervened target)'}
        ax.set_title(f'{title_map[label]}\n{tag} layer={LAYER} intensity={INTENSITY}',
                     fontsize=12, fontweight='bold')
        ax.set_xticks(tick_positions); ax.set_xticklabels(tick_labels, fontsize=8)
        ax.set_yticks(tick_positions); ax.set_yticklabels(tick_labels, fontsize=8)
        plt.colorbar(im, ax=ax, label=label, shrink=0.85)
        fig.tight_layout()
        fig.savefig(os.path.join(plot_dir, fname), dpi=200, bbox_inches='tight')
        plt.close()
        print(f"Saved: {fname}")

    # Third figure: raw sample counts, to judge which rate cells are reliable.
    fig, ax = plt.subplots(figsize=(10, 8.5))
    im = ax.imshow(count_map, aspect='auto', cmap='viridis',
                   interpolation='nearest', origin='lower')
    ax.set_xlabel('Intervened-toward Number (binned)', fontsize=12)
    ax.set_ylabel('Current Number (binned)', fontsize=12)
    ax.set_title(f'Sample Count per (current, target) bin\n{tag} layer={LAYER} intensity={INTENSITY}',
                 fontsize=11, fontweight='bold')
    ax.set_xticks(tick_positions); ax.set_xticklabels(tick_labels, fontsize=8)
    ax.set_yticks(tick_positions); ax.set_yticklabels(tick_labels, fontsize=8)
    plt.colorbar(im, ax=ax, label='Count', shrink=0.85)
    fig.tight_layout()
    fname = f'hijack_sample_count_heatmap_layer{LAYER}.png'
    fig.savefig(os.path.join(plot_dir, fname), dpi=200, bbox_inches='tight')
    plt.close()
    print(f"Saved: {fname}")
201
+
202
+
203
def main():
    """CLI entry: run the layer-1 hijack analysis for one checkpoint.

    Usage: python hijack_layer1_worker.py <checkpoint.pt> --output-dir <dir>
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('checkpoint', type=str)
    parser.add_argument('--output-dir', type=str, required=True)
    args = parser.parse_args()

    device = 'cuda'
    os.makedirs(args.output_dir, exist_ok=True)

    print(f"Loading {os.path.basename(args.checkpoint)} ...", flush=True)
    model, config = load_model(args.checkpoint, device)

    print(f"Running hijack layer {LAYER} ({N_TRIALS} trials) ...", flush=True)
    data = compute_hijack(model, config, device)
    print(f"Collected {len(data)} records", flush=True)

    # Derive the training iteration from a filename like
    # "<run>__ckpt<itr>.pt"; anything else is treated as the final model.
    bn = os.path.basename(args.checkpoint).replace('.pt', '')
    parts = bn.split('__')
    ckpt_type = parts[1] if len(parts) > 1 else 'final'
    itr = int(ckpt_type.replace('ckpt', '')) if ckpt_type.startswith('ckpt') else 1000000
    # NOTE(review): the V/B/lr/seed values in the tag are hard-coded for this
    # particular run — confirm before reusing the script on other runs.
    tag = f"V=256 B=16 lr=0.03 iters={itr} dseed=1337 iseed=1337"

    plot_heatmaps(data, args.output_dir, tag)


if __name__ == '__main__':
    main()
model_tbyt_train.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model matching the 200k-checkpoints architecture exactly.
3
+ Block uses self.attn / self.mlp naming (matching 200k state dict).
4
+ max_seq_len configurable (200k model uses 193).
5
+ """
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import math
10
+
11
+
12
class MLP(nn.Module):
    """Position-wise feed-forward sublayer: Linear -> tanh-GELU -> Linear.

    Naming (fc_1 / fc_2) matches the 200k-checkpoint state dicts. Hidden
    width is 3 * n_embd.
    """

    def __init__(self, config):
        super().__init__()
        self.fc_1 = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.fc_2 = nn.Linear(config.n_embd * 3, config.n_embd)
        # Mark the residual projection for scaled init. The original wrote
        # `self.NANO_SCALE_GPT = True` on the MLP module itself, which
        # GPT._init_weights never reads (it checks for `NANOGPT_SCALE_INIT`
        # on nn.Linear modules, cf. CasualSelfAttention.c_proj) — so fc_2
        # silently missed the (2*n_layers)**-0.5 init scaling.
        self.fc_2.NANOGPT_SCALE_INIT = True

    def forward(self, x):
        """Apply the two-layer MLP along the last dimension of x."""
        return self.fc_2(self.gelu(self.fc_1(x)))
22
+
23
+
24
class CasualSelfAttention(nn.Module):
    """Causal (masked) multi-head self-attention with a 0.1 logit scale.

    Layout matches the training checkpoints: one fused qkv projection
    (c_attn) followed by an output projection (c_proj). A lower-triangular
    mask buffer of size max_seq_len is precomputed at construction.
    """

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd
        self.n_heads = config.n_heads
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        max_len = config.max_seq_len
        causal_mask = torch.tril(torch.ones(max_len, max_len))
        self.register_buffer('bias', causal_mask.view(1, 1, max_len, max_len))
        self.c_proj.NANOGPT_SCALE_INIT = True
        self.config = config

    def forward(self, x, layer_n=-1):
        batch, seq, width = x.size()
        head_dim = width // self.n_heads

        def split_heads(t):
            # (batch, seq, width) -> (batch, heads, seq, head_dim)
            return t.view(batch, seq, self.n_heads, head_dim).transpose(1, 2)

        q, k, v = (split_heads(t) for t in self.c_attn(x).split(self.n_embd, dim=2))
        # Extra 0.1 factor on top of the usual 1/sqrt(d) scaling — matches
        # the trained checkpoints.
        scores = q @ k.transpose(-1, -2) * 0.1 / (k.size(-1)) ** 0.5
        scores = scores.masked_fill(self.bias[:, :, :seq, :seq] == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        out = (weights @ v).transpose(1, 2).contiguous().view(batch, seq, width)
        return self.c_proj(out)
50
+
51
+
52
class Block(nn.Module):
    """Pre-LN transformer block: x + attn(ln_1(x)), then x + mlp(ln_2(x))."""

    def __init__(self, config):
        super().__init__()
        self.attn = CasualSelfAttention(config)
        self.mlp = MLP(config)
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.ln_2 = nn.LayerNorm(config.n_embd)

    def forward(self, x, layer_n=-1):
        attn_out = self.attn(self.ln_1(x), layer_n=layer_n)
        hidden = x + attn_out
        return hidden + self.mlp(self.ln_2(hidden))
63
+
64
+
65
class GPT(nn.Module):
    """Decoder-only transformer for the sort task (200k-checkpoint layout).

    The input is [unsorted block | separator | sorted block]; the loss is
    next-token cross-entropy over the sorted half only. wte has
    vocab_size + 1 rows (the extra row is the separator token) and lm_head
    shares wte's weight (weight tying).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.n_layers = config.n_layers
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size + 1, config.n_embd),
            wpe=nn.Embedding(config.max_seq_len, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layers)]),
            ln_f=nn.LayerNorm(config.n_embd)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: output logits use the token-embedding table.
        self.lm_head.weight = self.transformer.wte.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Normal(0, 0.02) init; Linear modules flagged NANOGPT_SCALE_INIT
        (residual projections) get the (2*n_layers)**-0.5 scaling."""
        std = 0.02
        if isinstance(module, nn.Linear):
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.n_layers) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        if isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0, std=std)

    def forward(self, idx, targets=None, flag=False):
        """Return (logits, loss).

        `targets` and `flag` are accepted for caller compatibility but are
        unused: the loss targets are always taken from idx itself (predict
        idx[:, block_size+1:] from positions block_size .. T-2).
        """
        _, T = idx.size()
        x = self.transformer.wte(idx)
        # NOTE(review): the positional table (wpe) is constructed but never
        # added to x here — this matches the checkpointed architecture;
        # confirm it is intentional.
        for layer_n, block in enumerate(self.transformer.h, start=1):
            x = block(x, layer_n)
        if self.config.with_layer_norm:
            x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        tensor1 = logits[:, self.config.block_size:T - 1, :].contiguous().view(-1, logits.size(-1))
        tensor2 = idx[:, self.config.block_size + 1:].contiguous().view(-1)
        loss = F.cross_entropy(tensor1, tensor2)
        return logits, loss
106
+
107
+
108
class GPTConfig:
    """Hyperparameters for GPT. Class attributes hold the 200k-run defaults."""

    block_size: int = 16        # length of the unsorted half
    vocab_size: int = 256       # number tokens (separator is vocab_size)
    n_layers: int = 2
    n_heads: int = 1
    n_embd: int = 64
    with_layer_norm: bool = True
    max_seq_len: int = 193      # positional-table size of the 200k model

    def __init__(self, block_size=None, vocab_size=None, with_layer_norm=True,
                 max_seq_len=193, n_layers=None, n_heads=None, n_embd=None):
        # None means "keep the class-level default" for each optional field.
        # n_layers / n_heads / n_embd are new, backward-compatible overrides
        # (previously only settable by mutating the class attributes).
        if block_size is not None:
            self.block_size = block_size
        if vocab_size is not None:
            self.vocab_size = vocab_size
        if n_layers is not None:
            self.n_layers = n_layers
        if n_heads is not None:
            self.n_heads = n_heads
        if n_embd is not None:
            self.n_embd = n_embd
        self.with_layer_norm = with_layer_norm
        self.max_seq_len = max_seq_len
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_accuracy.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_conditional_accuracy.png ADDED

Git LFS Details

  • SHA256: fae10f51c5b6b25cfc31bb8cbb6e3e29483888cdee11137553af94f2f95e6651
  • Pointer size: 131 Bytes
  • Size of remote file: 137 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/ablation_per_position.png ADDED

Git LFS Details

  • SHA256: 280d85628c5e03d612d1f0ed485ea7acbcd8bc374a65c1278a64ddbca1a47dd7
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/attn_heatmaps.png ADDED

Git LFS Details

  • SHA256: fd088ed11b1ae6cbba95371546304b47b4fee2423b855b67807771531b9bc619
  • Pointer size: 131 Bytes
  • Size of remote file: 117 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f1680ae8653c687e1aa97f956d9f07eeb969bb1404087097c4c1a799ecb6fe
3
+ size 526858
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer0.png ADDED

Git LFS Details

  • SHA256: e44904c556676242f1333664df3520613af6f86b1f3b5ad99abed7665fdacb27
  • Pointer size: 131 Bytes
  • Size of remote file: 144 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2174fce6c178f07e77275564dc56614aa037372e9610bdb52af9623b5f0680a3
3
+ size 526858
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/avg_attn_by_number_layer1.png ADDED

Git LFS Details

  • SHA256: 87bfd7e2a3d42e66320e03576f0a5917d09194888360bd188829b4d08a4bc97a
  • Pointer size: 131 Bytes
  • Size of remote file: 186 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_accuracy.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/baseline_conditional_accuracy.png ADDED

Git LFS Details

  • SHA256: fb85a2c46c790ed4f8b7a63b8dfdae49ae3db9ec2fa9eaafeaa91c7675d1692b
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer0.png ADDED

Git LFS Details

  • SHA256: a9988ed5bb5feed6b466aa5e73bccad5a03cf02452d1ce911b1177f284aba129
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/cinclogits_layer1.png ADDED

Git LFS Details

  • SHA256: 7b7160129b67bef3f0a4786a97fe6901a066e52ae79f93384558cf16ef3945c3
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_bynext_heatmap_layer0.png ADDED

Git LFS Details

  • SHA256: 7e17a431eedda876718314acc3b2b181c98d4b25fbfc5d433f2cf09fd4525ad0
  • Pointer size: 131 Bytes
  • Size of remote file: 107 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer0.png ADDED

Git LFS Details

  • SHA256: e3c5106599602dc40b469c264416b738bc7366e1807717995cba204fa73b32a7
  • Pointer size: 131 Bytes
  • Size of remote file: 100 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_breaking_rate_heatmap_layer1.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_bynext_heatmap_layer0.png ADDED

Git LFS Details

  • SHA256: 43840a185a0a85ede33ce834c9a2db3de9f66e12e2b92db7a8f13b9d0d1d323b
  • Pointer size: 131 Bytes
  • Size of remote file: 103 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer0.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_hijack_rate_heatmap_layer1.png ADDED

Git LFS Details

  • SHA256: 1ac9805a37a5f9f22a5796793e3d9d7ddb4d5509ab15e59a1b069dfb4a494ef7
  • Pointer size: 131 Bytes
  • Size of remote file: 103 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer0.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/hijack_sample_count_heatmap_layer1.png ADDED

Git LFS Details

  • SHA256: 6b0c1f28efc8e3d8ce28ab5868fca7fb41f4ee2f756c063e564c05daa935a2c9
  • Pointer size: 131 Bytes
  • Size of remote file: 100 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_asym_ub60_lb60.png ADDED

Git LFS Details

  • SHA256: bd63c5a606a82715f394bccd4305470c8f7ffefd5533f28ead4c2a4b12f707b3
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub10.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub15.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub20.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub30.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub50.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub60.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer0_ub60_high.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub10.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub15.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub20.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub30.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub50.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub60.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intensity_layer1_ub60_high.png ADDED
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_random_layer0.png ADDED

Git LFS Details

  • SHA256: 0f067dad4da72649cb8a137002051042df7001f419e0c76cb8be47ae0241e3ab
  • Pointer size: 131 Bytes
  • Size of remote file: 340 kB
outputs/plots_V256_B16_LR3e-2_MI1000000_E64_H1_L2_ds1337_is1337_ckpt1000000/intervention_pernumber_separator_layer0.png ADDED

Git LFS Details

  • SHA256: 954494723b1475b0fb7d55f56f35e1a1393e68a3efada62959188705e2b238c3
  • Pointer size: 131 Bytes
  • Size of remote file: 104 kB
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_accuracy.png ADDED
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_conditional_accuracy.png ADDED

Git LFS Details

  • SHA256: 198a1f5de4526f3811c7b77f3c29f45fa1269150c1a13181888a8a481e592319
  • Pointer size: 131 Bytes
  • Size of remote file: 161 kB
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/ablation_per_position.png ADDED

Git LFS Details

  • SHA256: 50a8324c292b9013c133d054290d55165cd96a6744117b256b13cfcad0121a3f
  • Pointer size: 131 Bytes
  • Size of remote file: 149 kB
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/attn_heatmaps.png ADDED

Git LFS Details

  • SHA256: 3f4db97c84a4a4fa5167788465b10623c2fb0b6fd077fcc5f3413c0715d17634
  • Pointer size: 131 Bytes
  • Size of remote file: 117 kB
outputs/plots_V256_B16_LR3e-2_MI100000_E64_H1_L2_ds1337_is1337_ckpt100000/avg_attn_by_number_layer0.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eebc87b61451f35001b6cb56a92cb9129d97f7bed18c2b02e539917f108967b3
3
+ size 526858