saebench / sparse_probing /gemma-2-2b-res-snap-matryoshka-dc_blocks.21.hook_resid_post_eval_results.json
chanind's picture
Upload sparse_probing/gemma-2-2b-res-snap-matryoshka-dc_blocks.21.hook_resid_post_eval_results.json with huggingface_hub
2281d16 verified
{
"eval_type_id": "sparse_probing",
"eval_config": {
"random_seed": 42,
"dataset_names": [
"LabHC/bias_in_bios_class_set1",
"LabHC/bias_in_bios_class_set2",
"LabHC/bias_in_bios_class_set3",
"canrager/amazon_reviews_mcauley_1and5",
"canrager/amazon_reviews_mcauley_1and5_sentiment",
"codeparrot/github-code",
"fancyzhx/ag_news",
"Helsinki-NLP/europarl"
],
"probe_train_set_size": 4000,
"probe_test_set_size": 1000,
"context_length": 128,
"sae_batch_size": 125,
"llm_batch_size": 32,
"llm_dtype": "bfloat16",
"model_name": "gemma-2-2b",
"k_values": [
1,
2,
5
],
"lower_vram_usage": false
},
"eval_id": "acf88c24-09bd-4561-9b58-1bd96edb30c6",
"datetime_epoch_millis": 1745629709739,
"eval_result_metrics": {
"llm": {
"llm_test_accuracy": 0.9593437947332858,
"llm_top_1_test_accuracy": 0.70410625,
"llm_top_2_test_accuracy": 0.7557,
"llm_top_5_test_accuracy": 0.8173062500000001,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null
},
"sae": {
"sae_test_accuracy": 0.9573500454425812,
"sae_top_1_test_accuracy": 0.80261875,
"sae_top_2_test_accuracy": 0.8779749999999998,
"sae_top_5_test_accuracy": 0.9056250000000001,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
}
},
"eval_result_details": [
{
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
"llm_test_accuracy": 0.9682000517845154,
"llm_top_1_test_accuracy": 0.6666000000000001,
"llm_top_2_test_accuracy": 0.7152000000000001,
"llm_top_5_test_accuracy": 0.7978,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9646000504493714,
"sae_top_1_test_accuracy": 0.876,
"sae_top_2_test_accuracy": 0.8783999999999998,
"sae_top_5_test_accuracy": 0.9312000000000001,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
"llm_test_accuracy": 0.9588000416755676,
"llm_top_1_test_accuracy": 0.6688000000000001,
"llm_top_2_test_accuracy": 0.685,
"llm_top_5_test_accuracy": 0.7556,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9486000418663025,
"sae_top_1_test_accuracy": 0.7762,
"sae_top_2_test_accuracy": 0.8555999999999999,
"sae_top_5_test_accuracy": 0.8937999999999999,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
"llm_test_accuracy": 0.9284000515937805,
"llm_top_1_test_accuracy": 0.6984,
"llm_top_2_test_accuracy": 0.7352000000000001,
"llm_top_5_test_accuracy": 0.7849999999999999,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9282000541687012,
"sae_top_1_test_accuracy": 0.8054,
"sae_top_2_test_accuracy": 0.8240000000000001,
"sae_top_5_test_accuracy": 0.8680000000000001,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
"llm_test_accuracy": 0.9332000494003296,
"llm_top_1_test_accuracy": 0.6744,
"llm_top_2_test_accuracy": 0.7394000000000001,
"llm_top_5_test_accuracy": 0.8013999999999999,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9358000516891479,
"sae_top_1_test_accuracy": 0.8099999999999999,
"sae_top_2_test_accuracy": 0.8385999999999999,
"sae_top_5_test_accuracy": 0.8469999999999999,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
"llm_test_accuracy": 0.9670000374317169,
"llm_top_1_test_accuracy": 0.685,
"llm_top_2_test_accuracy": 0.749,
"llm_top_5_test_accuracy": 0.79,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9625000357627869,
"sae_top_1_test_accuracy": 0.602,
"sae_top_2_test_accuracy": 0.878,
"sae_top_5_test_accuracy": 0.885,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "codeparrot/github-code_results",
"llm_test_accuracy": 0.9688000440597534,
"llm_top_1_test_accuracy": 0.6298,
"llm_top_2_test_accuracy": 0.6841999999999999,
"llm_top_5_test_accuracy": 0.7826,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9712000489234924,
"sae_top_1_test_accuracy": 0.7732,
"sae_top_2_test_accuracy": 0.8728,
"sae_top_5_test_accuracy": 0.9274000000000001,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "fancyzhx/ag_news_results",
"llm_test_accuracy": 0.9507500529289246,
"llm_top_1_test_accuracy": 0.65025,
"llm_top_2_test_accuracy": 0.747,
"llm_top_5_test_accuracy": 0.82725,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9485000371932983,
"sae_top_1_test_accuracy": 0.81775,
"sae_top_2_test_accuracy": 0.882,
"sae_top_5_test_accuracy": 0.893,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
},
{
"dataset_name": "Helsinki-NLP/europarl_results",
"llm_test_accuracy": 0.9996000289916992,
"llm_top_1_test_accuracy": 0.9596,
"llm_top_2_test_accuracy": 0.9905999999999999,
"llm_top_5_test_accuracy": 0.9987999999999999,
"llm_top_10_test_accuracy": null,
"llm_top_20_test_accuracy": null,
"llm_top_50_test_accuracy": null,
"llm_top_100_test_accuracy": null,
"sae_test_accuracy": 0.9994000434875489,
"sae_top_1_test_accuracy": 0.9603999999999999,
"sae_top_2_test_accuracy": 0.9944,
"sae_top_5_test_accuracy": 0.9996,
"sae_top_10_test_accuracy": null,
"sae_top_20_test_accuracy": null,
"sae_top_50_test_accuracy": null,
"sae_top_100_test_accuracy": null
}
],
"sae_bench_commit_hash": "Unknown",
"sae_lens_id": "blocks.21.hook_resid_post",
"sae_lens_release_id": "gemma-2-2b-res-snap-matryoshka-dc",
"sae_lens_version": "5.9.1",
"sae_cfg_dict": {
"architecture": "jumprelu",
"d_in": 2304,
"d_sae": 32768,
"activation_fn_str": "relu",
"apply_b_dec_to_input": true,
"finetuning_scaling_factor": false,
"context_size": 1024,
"model_name": "gemma-2-2b",
"hook_name": "blocks.21.hook_resid_post",
"hook_layer": 21,
"hook_head_index": null,
"prepend_bos": true,
"dataset_path": "chanind/pile-uncopyrighted-gemma-1024-abbrv-1B",
"dataset_trust_remote_code": true,
"normalize_activations": "none",
"dtype": "torch.bfloat16",
"device": "cuda",
"sae_lens_training_version": "5.5.1",
"activation_fn_kwargs": {
"k": 40
},
"neuronpedia_id": null,
"model_from_pretrained_kwargs": {
"center_writing_weights": false
},
"seqpos_slice": [
null
]
},
"eval_result_unstructured": {
"LabHC/bias_in_bios_class_set1_results": {
"sae_test_accuracy": {
"0": 0.9450000524520874,
"1": 0.9630000591278076,
"2": 0.9460000395774841,
"6": 0.9900000691413879,
"9": 0.9790000319480896
},
"llm_test_accuracy": {
"0": 0.9510000348091125,
"1": 0.9630000591278076,
"2": 0.9580000638961792,
"6": 0.9880000352859497,
"9": 0.9810000658035278
},
"llm_top_1_test_accuracy": {
"0": 0.578,
"1": 0.658,
"2": 0.691,
"6": 0.744,
"9": 0.662
},
"llm_top_2_test_accuracy": {
"0": 0.567,
"1": 0.67,
"2": 0.786,
"6": 0.807,
"9": 0.746
},
"llm_top_5_test_accuracy": {
"0": 0.619,
"1": 0.708,
"2": 0.828,
"6": 0.906,
"9": 0.928
},
"sae_top_1_test_accuracy": {
"0": 0.777,
"1": 0.795,
"2": 0.879,
"6": 0.98,
"9": 0.949
},
"sae_top_2_test_accuracy": {
"0": 0.768,
"1": 0.802,
"2": 0.879,
"6": 0.989,
"9": 0.954
},
"sae_top_5_test_accuracy": {
"0": 0.888,
"1": 0.914,
"2": 0.909,
"6": 0.991,
"9": 0.954
}
},
"LabHC/bias_in_bios_class_set2_results": {
"sae_test_accuracy": {
"11": 0.9580000638961792,
"13": 0.9430000185966492,
"14": 0.9600000381469727,
"18": 0.9320000410079956,
"19": 0.9500000476837158
},
"llm_test_accuracy": {
"11": 0.9660000205039978,
"13": 0.956000030040741,
"14": 0.9690000414848328,
"18": 0.9410000443458557,
"19": 0.9620000720024109
},
"llm_top_1_test_accuracy": {
"11": 0.558,
"13": 0.666,
"14": 0.647,
"18": 0.695,
"19": 0.778
},
"llm_top_2_test_accuracy": {
"11": 0.598,
"13": 0.675,
"14": 0.656,
"18": 0.693,
"19": 0.803
},
"llm_top_5_test_accuracy": {
"11": 0.83,
"13": 0.756,
"14": 0.658,
"18": 0.741,
"19": 0.793
},
"sae_top_1_test_accuracy": {
"11": 0.731,
"13": 0.674,
"14": 0.909,
"18": 0.719,
"19": 0.848
},
"sae_top_2_test_accuracy": {
"11": 0.858,
"13": 0.78,
"14": 0.917,
"18": 0.858,
"19": 0.865
},
"sae_top_5_test_accuracy": {
"11": 0.934,
"13": 0.804,
"14": 0.921,
"18": 0.92,
"19": 0.89
}
},
"LabHC/bias_in_bios_class_set3_results": {
"sae_test_accuracy": {
"20": 0.9470000267028809,
"21": 0.9160000681877136,
"22": 0.9250000715255737,
"25": 0.9640000462532043,
"26": 0.8890000581741333
},
"llm_test_accuracy": {
"20": 0.9610000252723694,
"21": 0.921000063419342,
"22": 0.9160000681877136,
"25": 0.9600000381469727,
"26": 0.8840000629425049
},
"llm_top_1_test_accuracy": {
"20": 0.725,
"21": 0.755,
"22": 0.652,
"25": 0.714,
"26": 0.646
},
"llm_top_2_test_accuracy": {
"20": 0.81,
"21": 0.775,
"22": 0.694,
"25": 0.738,
"26": 0.659
},
"llm_top_5_test_accuracy": {
"20": 0.824,
"21": 0.807,
"22": 0.801,
"25": 0.787,
"26": 0.706
},
"sae_top_1_test_accuracy": {
"20": 0.852,
"21": 0.797,
"22": 0.858,
"25": 0.878,
"26": 0.642
},
"sae_top_2_test_accuracy": {
"20": 0.869,
"21": 0.807,
"22": 0.851,
"25": 0.861,
"26": 0.732
},
"sae_top_5_test_accuracy": {
"20": 0.928,
"21": 0.85,
"22": 0.861,
"25": 0.898,
"26": 0.803
}
},
"canrager/amazon_reviews_mcauley_1and5_results": {
"sae_test_accuracy": {
"1": 0.9610000252723694,
"2": 0.9410000443458557,
"3": 0.940000057220459,
"5": 0.9440000653266907,
"6": 0.893000066280365
},
"llm_test_accuracy": {
"1": 0.9630000591278076,
"2": 0.940000057220459,
"3": 0.937000036239624,
"5": 0.9330000281333923,
"6": 0.893000066280365
},
"llm_top_1_test_accuracy": {
"1": 0.688,
"2": 0.795,
"3": 0.63,
"5": 0.566,
"6": 0.693
},
"llm_top_2_test_accuracy": {
"1": 0.804,
"2": 0.803,
"3": 0.642,
"5": 0.776,
"6": 0.672
},
"llm_top_5_test_accuracy": {
"1": 0.824,
"2": 0.886,
"3": 0.762,
"5": 0.822,
"6": 0.713
},
"sae_top_1_test_accuracy": {
"1": 0.888,
"2": 0.864,
"3": 0.683,
"5": 0.864,
"6": 0.751
},
"sae_top_2_test_accuracy": {
"1": 0.906,
"2": 0.868,
"3": 0.753,
"5": 0.896,
"6": 0.77
},
"sae_top_5_test_accuracy": {
"1": 0.924,
"2": 0.891,
"3": 0.772,
"5": 0.888,
"6": 0.76
}
},
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
"sae_test_accuracy": {
"1.0": 0.9640000462532043,
"5.0": 0.9610000252723694
},
"llm_test_accuracy": {
"1.0": 0.9650000333786011,
"5.0": 0.9690000414848328
},
"llm_top_1_test_accuracy": {
"1.0": 0.685,
"5.0": 0.685
},
"llm_top_2_test_accuracy": {
"1.0": 0.749,
"5.0": 0.749
},
"llm_top_5_test_accuracy": {
"1.0": 0.79,
"5.0": 0.79
},
"sae_top_1_test_accuracy": {
"1.0": 0.602,
"5.0": 0.602
},
"sae_top_2_test_accuracy": {
"1.0": 0.878,
"5.0": 0.878
},
"sae_top_5_test_accuracy": {
"1.0": 0.885,
"5.0": 0.885
}
},
"codeparrot/github-code_results": {
"sae_test_accuracy": {
"C": 0.9510000348091125,
"Python": 0.9850000739097595,
"HTML": 0.9830000400543213,
"Java": 0.9660000205039978,
"PHP": 0.971000075340271
},
"llm_test_accuracy": {
"C": 0.9600000381469727,
"Python": 0.9880000352859497,
"HTML": 0.9850000739097595,
"Java": 0.9570000171661377,
"PHP": 0.9540000557899475
},
"llm_top_1_test_accuracy": {
"C": 0.559,
"Python": 0.62,
"HTML": 0.801,
"Java": 0.595,
"PHP": 0.574
},
"llm_top_2_test_accuracy": {
"C": 0.657,
"Python": 0.629,
"HTML": 0.872,
"Java": 0.617,
"PHP": 0.646
},
"llm_top_5_test_accuracy": {
"C": 0.852,
"Python": 0.726,
"HTML": 0.89,
"Java": 0.76,
"PHP": 0.685
},
"sae_top_1_test_accuracy": {
"C": 0.789,
"Python": 0.623,
"HTML": 0.924,
"Java": 0.6,
"PHP": 0.93
},
"sae_top_2_test_accuracy": {
"C": 0.892,
"Python": 0.938,
"HTML": 0.94,
"Java": 0.656,
"PHP": 0.938
},
"sae_top_5_test_accuracy": {
"C": 0.881,
"Python": 0.972,
"HTML": 0.952,
"Java": 0.897,
"PHP": 0.935
}
},
"fancyzhx/ag_news_results": {
"sae_test_accuracy": {
"0": 0.9380000233650208,
"1": 0.9790000319480896,
"2": 0.9310000538825989,
"3": 0.9460000395774841
},
"llm_test_accuracy": {
"0": 0.940000057220459,
"1": 0.9880000352859497,
"2": 0.9270000457763672,
"3": 0.9480000734329224
},
"llm_top_1_test_accuracy": {
"0": 0.69,
"1": 0.636,
"2": 0.555,
"3": 0.72
},
"llm_top_2_test_accuracy": {
"0": 0.784,
"1": 0.786,
"2": 0.679,
"3": 0.739
},
"llm_top_5_test_accuracy": {
"0": 0.833,
"1": 0.86,
"2": 0.801,
"3": 0.815
},
"sae_top_1_test_accuracy": {
"0": 0.866,
"1": 0.851,
"2": 0.81,
"3": 0.744
},
"sae_top_2_test_accuracy": {
"0": 0.873,
"1": 0.977,
"2": 0.867,
"3": 0.811
},
"sae_top_5_test_accuracy": {
"0": 0.882,
"1": 0.975,
"2": 0.852,
"3": 0.863
}
},
"Helsinki-NLP/europarl_results": {
"sae_test_accuracy": {
"en": 0.999000072479248,
"fr": 0.999000072479248,
"de": 1.0,
"es": 0.999000072479248,
"nl": 1.0
},
"llm_test_accuracy": {
"en": 1.0,
"fr": 1.0,
"de": 0.999000072479248,
"es": 0.999000072479248,
"nl": 1.0
},
"llm_top_1_test_accuracy": {
"en": 0.94,
"fr": 0.996,
"de": 1.0,
"es": 0.862,
"nl": 1.0
},
"llm_top_2_test_accuracy": {
"en": 0.958,
"fr": 0.996,
"de": 0.999,
"es": 1.0,
"nl": 1.0
},
"llm_top_5_test_accuracy": {
"en": 0.999,
"fr": 0.998,
"de": 0.997,
"es": 1.0,
"nl": 1.0
},
"sae_top_1_test_accuracy": {
"en": 0.944,
"fr": 0.998,
"de": 1.0,
"es": 0.86,
"nl": 1.0
},
"sae_top_2_test_accuracy": {
"en": 0.976,
"fr": 0.997,
"de": 1.0,
"es": 0.999,
"nl": 1.0
},
"sae_top_5_test_accuracy": {
"en": 1.0,
"fr": 0.999,
"de": 1.0,
"es": 0.999,
"nl": 1.0
}
}
}
}