saebench / sparse_probing /gemma-2-2b-res-matryoshka-dc_blocks.5.hook_resid_post_eval_results.json
chanind's picture
Upload sparse_probing/gemma-2-2b-res-matryoshka-dc_blocks.5.hook_resid_post_eval_results.json with huggingface_hub
b6959d5 verified
{
"eval_type_id": "sparse_probing",
"eval_config": {
"random_seed": 42,
"dataset_names": [
"LabHC/bias_in_bios_class_set1",
"LabHC/bias_in_bios_class_set2",
"LabHC/bias_in_bios_class_set3",
"canrager/amazon_reviews_mcauley_1and5",
"canrager/amazon_reviews_mcauley_1and5_sentiment",
"codeparrot/github-code",
"fancyzhx/ag_news",
"Helsinki-NLP/europarl"
],
"probe_train_set_size": 4000,
"probe_test_set_size": 1000,
"context_length": 128,
"sae_batch_size": 125,
"llm_batch_size": 32,
"llm_dtype": "bfloat16",
"model_name": "gemma-2-2b",
"k_values": [
1,
2,
5
],
"lower_vram_usage": false
},
"eval_id": "23b811f8-60d2-4669-9569-74fb61ee47a0",
"datetime_epoch_millis": 1745753778261,
"eval_result_metrics": {
"llm": {
"llm_test_accuracy": 0.94785629324615,
"llm_top_1_test_accuracy": 0.679,
"llm_top_2_test_accuracy": 0.7241375,
"llm_top_5_test_accuracy": 0.7792437499999999,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null
},
"sae": {
"sae_test_accuracy": 0.9424250468611718,
"sae_top_1_test_accuracy": 0.761725,
"sae_top_2_test_accuracy": 0.7938937500000001,
"sae_top_5_test_accuracy": 0.8631499999999999,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
}
},
"eval_result_details": [
{
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
"llm_test_accuracy": 0.9626000404357911,
"llm_top_1_test_accuracy": 0.6714,
"llm_top_2_test_accuracy": 0.6866,
"llm_top_5_test_accuracy": 0.7459999999999999,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9566000461578369,
"sae_top_1_test_accuracy": 0.7922,
"sae_top_2_test_accuracy": 0.8124,
"sae_top_5_test_accuracy": 0.9067999999999999,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
"llm_test_accuracy": 0.9476000547409058,
"llm_top_1_test_accuracy": 0.6726,
"llm_top_2_test_accuracy": 0.7218,
"llm_top_5_test_accuracy": 0.7754,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9378000497817993,
"sae_top_1_test_accuracy": 0.7180000000000001,
"sae_top_2_test_accuracy": 0.7326,
"sae_top_5_test_accuracy": 0.8038000000000001,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
"llm_test_accuracy": 0.9228000402450561,
"llm_top_1_test_accuracy": 0.6772,
"llm_top_2_test_accuracy": 0.708,
"llm_top_5_test_accuracy": 0.7418,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.920400059223175,
"sae_top_1_test_accuracy": 0.7306000000000001,
"sae_top_2_test_accuracy": 0.7992,
"sae_top_5_test_accuracy": 0.835,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
"llm_test_accuracy": 0.9000000357627869,
"llm_top_1_test_accuracy": 0.615,
"llm_top_2_test_accuracy": 0.6302000000000001,
"llm_top_5_test_accuracy": 0.6941999999999999,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.8984000444412231,
"sae_top_1_test_accuracy": 0.692,
"sae_top_2_test_accuracy": 0.7429999999999999,
"sae_top_5_test_accuracy": 0.8023999999999999,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
"llm_test_accuracy": 0.9340000450611115,
"llm_top_1_test_accuracy": 0.628,
"llm_top_2_test_accuracy": 0.686,
"llm_top_5_test_accuracy": 0.738,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9130000472068787,
"sae_top_1_test_accuracy": 0.755,
"sae_top_2_test_accuracy": 0.776,
"sae_top_5_test_accuracy": 0.842,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "codeparrot/github-code_results",
"llm_test_accuracy": 0.9710000514984131,
"llm_top_1_test_accuracy": 0.6568,
"llm_top_2_test_accuracy": 0.708,
"llm_top_5_test_accuracy": 0.7871999999999999,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9690000414848328,
"sae_top_1_test_accuracy": 0.6876,
"sae_top_2_test_accuracy": 0.6894000000000001,
"sae_top_5_test_accuracy": 0.843,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "fancyzhx/ag_news_results",
"llm_test_accuracy": 0.9452500492334366,
"llm_top_1_test_accuracy": 0.7120000000000001,
"llm_top_2_test_accuracy": 0.7595,
"llm_top_5_test_accuracy": 0.80375,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9450000524520874,
"sae_top_1_test_accuracy": 0.755,
"sae_top_2_test_accuracy": 0.80175,
"sae_top_5_test_accuracy": 0.8739999999999999,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "Helsinki-NLP/europarl_results",
"llm_test_accuracy": 0.9996000289916992,
"llm_top_1_test_accuracy": 0.799,
"llm_top_2_test_accuracy": 0.893,
"llm_top_5_test_accuracy": 0.9475999999999999,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9992000341415406,
"sae_top_1_test_accuracy": 0.9634,
"sae_top_2_test_accuracy": 0.9968,
"sae_top_5_test_accuracy": 0.9982,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
}
],
"sae_bench_commit_hash": "Unknown",
"sae_lens_id": "blocks.5.hook_resid_post",
"sae_lens_release_id": "gemma-2-2b-res-matryoshka-dc",
"sae_lens_version": "5.9.1",
"sae_cfg_dict": {
"architecture": "jumprelu",
"d_in": 2304,
"d_sae": 32768,
"activation_fn_str": "relu",
"apply_b_dec_to_input": true,
"finetuning_scaling_factor": false,
"context_size": 1024,
"model_name": "gemma-2-2b",
"hook_name": "blocks.5.hook_resid_post",
"hook_layer": 5,
"hook_head_index": null,
"prepend_bos": true,
"dataset_path": "chanind/pile-uncopyrighted-gemma-1024-abbrv-1B",
"dataset_trust_remote_code": true,
"normalize_activations": "none",
"dtype": "torch.bfloat16",
"device": "cuda",
"sae_lens_training_version": "5.5.1",
"activation_fn_kwargs": {
"k": 40
},
"neuronpedia_id": null,
"model_from_pretrained_kwargs": {
"center_writing_weights": false
},
"seqpos_slice": [
null
]
},
"eval_result_unstructured": {
"LabHC/bias_in_bios_class_set1_results": {
"sae_test_accuracy": {
"0": 0.937000036239624,
"1": 0.9550000429153442,
"2": 0.940000057220459,
"6": 0.9790000319480896,
"9": 0.9720000624656677
},
"llm_test_accuracy": {
"0": 0.9360000491142273,
"1": 0.9650000333786011,
"2": 0.9450000524520874,
"6": 0.9940000176429749,
"9": 0.9730000495910645
},
"llm_top_1_test_accuracy": {
"0": 0.595,
"1": 0.64,
"2": 0.67,
"6": 0.765,
"9": 0.687
},
"llm_top_2_test_accuracy": {
"0": 0.6,
"1": 0.619,
"2": 0.674,
"6": 0.809,
"9": 0.731
},
"llm_top_5_test_accuracy": {
"0": 0.697,
"1": 0.733,
"2": 0.72,
"6": 0.825,
"9": 0.755
},
"sae_top_1_test_accuracy": {
"0": 0.78,
"1": 0.614,
"2": 0.862,
"6": 0.965,
"9": 0.74
},
"sae_top_2_test_accuracy": {
"0": 0.799,
"1": 0.661,
"2": 0.862,
"6": 0.971,
"9": 0.769
},
"sae_top_5_test_accuracy": {
"0": 0.874,
"1": 0.834,
"2": 0.907,
"6": 0.976,
"9": 0.943
}
},
"LabHC/bias_in_bios_class_set2_results": {
"sae_test_accuracy": {
"11": 0.9480000734329224,
"13": 0.9450000524520874,
"14": 0.9390000700950623,
"18": 0.9000000357627869,
"19": 0.9570000171661377
},
"llm_test_accuracy": {
"11": 0.9580000638961792,
"13": 0.9440000653266907,
"14": 0.9510000348091125,
"18": 0.9220000505447388,
"19": 0.9630000591278076
},
"llm_top_1_test_accuracy": {
"11": 0.581,
"13": 0.693,
"14": 0.643,
"18": 0.687,
"19": 0.759
},
"llm_top_2_test_accuracy": {
"11": 0.76,
"13": 0.698,
"14": 0.664,
"18": 0.713,
"19": 0.774
},
"llm_top_5_test_accuracy": {
"11": 0.862,
"13": 0.769,
"14": 0.698,
"18": 0.727,
"19": 0.821
},
"sae_top_1_test_accuracy": {
"11": 0.819,
"13": 0.697,
"14": 0.637,
"18": 0.675,
"19": 0.762
},
"sae_top_2_test_accuracy": {
"11": 0.841,
"13": 0.732,
"14": 0.644,
"18": 0.702,
"19": 0.744
},
"sae_top_5_test_accuracy": {
"11": 0.861,
"13": 0.728,
"14": 0.809,
"18": 0.74,
"19": 0.881
}
},
"LabHC/bias_in_bios_class_set3_results": {
"sae_test_accuracy": {
"20": 0.9490000605583191,
"21": 0.9110000729560852,
"22": 0.9110000729560852,
"25": 0.9540000557899475,
"26": 0.8770000338554382
},
"llm_test_accuracy": {
"20": 0.9440000653266907,
"21": 0.9130000472068787,
"22": 0.9180000424385071,
"25": 0.956000030040741,
"26": 0.8830000162124634
},
"llm_top_1_test_accuracy": {
"20": 0.652,
"21": 0.718,
"22": 0.623,
"25": 0.737,
"26": 0.656
},
"llm_top_2_test_accuracy": {
"20": 0.784,
"21": 0.759,
"22": 0.606,
"25": 0.737,
"26": 0.654
},
"llm_top_5_test_accuracy": {
"20": 0.821,
"21": 0.769,
"22": 0.706,
"25": 0.752,
"26": 0.661
},
"sae_top_1_test_accuracy": {
"20": 0.903,
"21": 0.592,
"22": 0.891,
"25": 0.663,
"26": 0.604
},
"sae_top_2_test_accuracy": {
"20": 0.91,
"21": 0.691,
"22": 0.884,
"25": 0.867,
"26": 0.644
},
"sae_top_5_test_accuracy": {
"20": 0.915,
"21": 0.737,
"22": 0.877,
"25": 0.891,
"26": 0.755
}
},
"canrager/amazon_reviews_mcauley_1and5_results": {
"sae_test_accuracy": {
"1": 0.9290000200271606,
"2": 0.9230000376701355,
"3": 0.8840000629425049,
"5": 0.9040000438690186,
"6": 0.8520000576972961
},
"llm_test_accuracy": {
"1": 0.9240000247955322,
"2": 0.9200000166893005,
"3": 0.9040000438690186,
"5": 0.9030000567436218,
"6": 0.8490000367164612
},
"llm_top_1_test_accuracy": {
"1": 0.711,
"2": 0.595,
"3": 0.604,
"5": 0.558,
"6": 0.607
},
"llm_top_2_test_accuracy": {
"1": 0.695,
"2": 0.684,
"3": 0.604,
"5": 0.558,
"6": 0.61
},
"llm_top_5_test_accuracy": {
"1": 0.755,
"2": 0.742,
"3": 0.625,
"5": 0.658,
"6": 0.691
},
"sae_top_1_test_accuracy": {
"1": 0.616,
"2": 0.825,
"3": 0.623,
"5": 0.808,
"6": 0.588
},
"sae_top_2_test_accuracy": {
"1": 0.857,
"2": 0.838,
"3": 0.615,
"5": 0.813,
"6": 0.592
},
"sae_top_5_test_accuracy": {
"1": 0.888,
"2": 0.865,
"3": 0.728,
"5": 0.845,
"6": 0.686
}
},
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
"sae_test_accuracy": {
"1.0": 0.9150000214576721,
"5.0": 0.9110000729560852
},
"llm_test_accuracy": {
"1.0": 0.9350000619888306,
"5.0": 0.9330000281333923
},
"llm_top_1_test_accuracy": {
"1.0": 0.628,
"5.0": 0.628
},
"llm_top_2_test_accuracy": {
"1.0": 0.686,
"5.0": 0.686
},
"llm_top_5_test_accuracy": {
"1.0": 0.738,
"5.0": 0.738
},
"sae_top_1_test_accuracy": {
"1.0": 0.755,
"5.0": 0.755
},
"sae_top_2_test_accuracy": {
"1.0": 0.776,
"5.0": 0.776
},
"sae_top_5_test_accuracy": {
"1.0": 0.842,
"5.0": 0.842
}
},
"codeparrot/github-code_results": {
"sae_test_accuracy": {
"C": 0.9500000476837158,
"Python": 0.984000027179718,
"HTML": 0.9890000224113464,
"Java": 0.9640000462532043,
"PHP": 0.9580000638961792
},
"llm_test_accuracy": {
"C": 0.9600000381469727,
"Python": 0.987000048160553,
"HTML": 0.9900000691413879,
"Java": 0.9620000720024109,
"PHP": 0.956000030040741
},
"llm_top_1_test_accuracy": {
"C": 0.643,
"Python": 0.599,
"HTML": 0.796,
"Java": 0.641,
"PHP": 0.605
},
"llm_top_2_test_accuracy": {
"C": 0.699,
"Python": 0.615,
"HTML": 0.935,
"Java": 0.665,
"PHP": 0.626
},
"llm_top_5_test_accuracy": {
"C": 0.752,
"Python": 0.77,
"HTML": 0.955,
"Java": 0.764,
"PHP": 0.695
},
"sae_top_1_test_accuracy": {
"C": 0.64,
"Python": 0.604,
"HTML": 0.931,
"Java": 0.634,
"PHP": 0.629
},
"sae_top_2_test_accuracy": {
"C": 0.638,
"Python": 0.635,
"HTML": 0.925,
"Java": 0.637,
"PHP": 0.612
},
"sae_top_5_test_accuracy": {
"C": 0.688,
"Python": 0.927,
"HTML": 0.946,
"Java": 0.748,
"PHP": 0.906
}
},
"fancyzhx/ag_news_results": {
"sae_test_accuracy": {
"0": 0.940000057220459,
"1": 0.9740000367164612,
"2": 0.9160000681877136,
"3": 0.9500000476837158
},
"llm_test_accuracy": {
"0": 0.9310000538825989,
"1": 0.9810000658035278,
"2": 0.9290000200271606,
"3": 0.940000057220459
},
"llm_top_1_test_accuracy": {
"0": 0.744,
"1": 0.783,
"2": 0.639,
"3": 0.682
},
"llm_top_2_test_accuracy": {
"0": 0.738,
"1": 0.806,
"2": 0.687,
"3": 0.807
},
"llm_top_5_test_accuracy": {
"0": 0.776,
"1": 0.877,
"2": 0.742,
"3": 0.82
},
"sae_top_1_test_accuracy": {
"0": 0.735,
"1": 0.856,
"2": 0.732,
"3": 0.697
},
"sae_top_2_test_accuracy": {
"0": 0.749,
"1": 0.863,
"2": 0.836,
"3": 0.759
},
"sae_top_5_test_accuracy": {
"0": 0.861,
"1": 0.943,
"2": 0.848,
"3": 0.844
}
},
"Helsinki-NLP/europarl_results": {
"sae_test_accuracy": {
"en": 0.9980000257492065,
"fr": 1.0,
"de": 1.0,
"es": 0.999000072479248,
"nl": 0.999000072479248
},
"llm_test_accuracy": {
"en": 0.999000072479248,
"fr": 1.0,
"de": 1.0,
"es": 0.999000072479248,
"nl": 1.0
},
"llm_top_1_test_accuracy": {
"en": 0.894,
"fr": 0.634,
"de": 0.75,
"es": 0.88,
"nl": 0.837
},
"llm_top_2_test_accuracy": {
"en": 0.899,
"fr": 0.905,
"de": 0.83,
"es": 0.958,
"nl": 0.873
},
"llm_top_5_test_accuracy": {
"en": 0.984,
"fr": 0.969,
"de": 0.887,
"es": 0.978,
"nl": 0.92
},
"sae_top_1_test_accuracy": {
"en": 0.997,
"fr": 0.996,
"de": 0.986,
"es": 0.998,
"nl": 0.84
},
"sae_top_2_test_accuracy": {
"en": 0.998,
"fr": 0.997,
"de": 0.993,
"es": 0.998,
"nl": 0.998
},
"sae_top_5_test_accuracy": {
"en": 0.999,
"fr": 0.998,
"de": 0.995,
"es": 0.999,
"nl": 1.0
}
}
}
}