{ "eval_type_id": "sparse_probing", "eval_config": { "random_seed": 42, "dataset_names": [ "LabHC/bias_in_bios_class_set1", "LabHC/bias_in_bios_class_set2", "LabHC/bias_in_bios_class_set3", "canrager/amazon_reviews_mcauley_1and5", "canrager/amazon_reviews_mcauley_1and5_sentiment", "codeparrot/github-code", "fancyzhx/ag_news", "Helsinki-NLP/europarl" ], "probe_train_set_size": 4000, "probe_test_set_size": 1000, "context_length": 128, "sae_batch_size": 125, "llm_batch_size": 32, "llm_dtype": "bfloat16", "model_name": "gemma-2-2b", "k_values": [ 1, 2, 5 ], "lower_vram_usage": false }, "eval_id": "acf88c24-09bd-4561-9b58-1bd96edb30c6", "datetime_epoch_millis": 1745629709739, "eval_result_metrics": { "llm": { "llm_test_accuracy": 0.9593437947332858, "llm_top_1_test_accuracy": 0.70410625, "llm_top_2_test_accuracy": 0.7557, "llm_top_5_test_accuracy": 0.8173062500000001, "llm_top_10_test_accuracy": null, "llm_top_20_test_accuracy": null, "llm_top_50_test_accuracy": null, "llm_top_100_test_accuracy": null }, "sae": { "sae_test_accuracy": 0.9573500454425812, "sae_top_1_test_accuracy": 0.80261875, "sae_top_2_test_accuracy": 0.8779749999999998, "sae_top_5_test_accuracy": 0.9056250000000001, "sae_top_10_test_accuracy": null, "sae_top_20_test_accuracy": null, "sae_top_50_test_accuracy": null, "sae_top_100_test_accuracy": null } }, "eval_result_details": [ { "dataset_name": "LabHC/bias_in_bios_class_set1_results", "llm_test_accuracy": 0.9682000517845154, "llm_top_1_test_accuracy": 0.6666000000000001, "llm_top_2_test_accuracy": 0.7152000000000001, "llm_top_5_test_accuracy": 0.7978, "llm_top_10_test_accuracy": null, "llm_top_20_test_accuracy": null, "llm_top_50_test_accuracy": null, "llm_top_100_test_accuracy": null, "sae_test_accuracy": 0.9646000504493714, "sae_top_1_test_accuracy": 0.876, "sae_top_2_test_accuracy": 0.8783999999999998, "sae_top_5_test_accuracy": 0.9312000000000001, "sae_top_10_test_accuracy": null, "sae_top_20_test_accuracy": null, "sae_top_50_test_accuracy": null, "sae_top_100_test_accuracy": null }, { "dataset_name": "LabHC/bias_in_bios_class_set2_results", "llm_test_accuracy": 0.9588000416755676, "llm_top_1_test_accuracy": 0.6688000000000001, "llm_top_2_test_accuracy": 0.685, "llm_top_5_test_accuracy": 0.7556, "llm_top_10_test_accuracy": null, "llm_top_20_test_accuracy": null, "llm_top_50_test_accuracy": null, "llm_top_100_test_accuracy": null, "sae_test_accuracy": 0.9486000418663025, "sae_top_1_test_accuracy": 0.7762, "sae_top_2_test_accuracy": 0.8555999999999999, "sae_top_5_test_accuracy": 0.8937999999999999, "sae_top_10_test_accuracy": null, "sae_top_20_test_accuracy": null, "sae_top_50_test_accuracy": null, "sae_top_100_test_accuracy": null }, { "dataset_name": "LabHC/bias_in_bios_class_set3_results", "llm_test_accuracy": 0.9284000515937805, "llm_top_1_test_accuracy": 0.6984, "llm_top_2_test_accuracy": 0.7352000000000001, "llm_top_5_test_accuracy": 0.7849999999999999, "llm_top_10_test_accuracy": null, "llm_top_20_test_accuracy": null, "llm_top_50_test_accuracy": null, "llm_top_100_test_accuracy": null, "sae_test_accuracy": 0.9282000541687012, "sae_top_1_test_accuracy": 0.8054, "sae_top_2_test_accuracy": 0.8240000000000001, "sae_top_5_test_accuracy": 0.8680000000000001, "sae_top_10_test_accuracy": null, "sae_top_20_test_accuracy": null, "sae_top_50_test_accuracy": null, "sae_top_100_test_accuracy": null }, { "dataset_name": "canrager/amazon_reviews_mcauley_1and5_results", "llm_test_accuracy": 0.9332000494003296, "llm_top_1_test_accuracy": 0.6744, "llm_top_2_test_accuracy": 
0.7394000000000001, "llm_top_5_test_accuracy": 0.8013999999999999, "llm_top_10_test_accuracy": null, "llm_top_20_test_accuracy": null, "llm_top_50_test_accuracy": null, "llm_top_100_test_accuracy": null, "sae_test_accuracy": 0.9358000516891479, "sae_top_1_test_accuracy": 0.8099999999999999, "sae_top_2_test_accuracy": 0.8385999999999999, "sae_top_5_test_accuracy": 0.8469999999999999, "sae_top_10_test_accuracy": null, "sae_top_20_test_accuracy": null, "sae_top_50_test_accuracy": null, "sae_top_100_test_accuracy": null }, { "dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results", "llm_test_accuracy": 0.9670000374317169, "llm_top_1_test_accuracy": 0.685, "llm_top_2_test_accuracy": 0.749, "llm_top_5_test_accuracy": 0.79, "llm_top_10_test_accuracy": null, "llm_top_20_test_accuracy": null, "llm_top_50_test_accuracy": null, "llm_top_100_test_accuracy": null, "sae_test_accuracy": 0.9625000357627869, "sae_top_1_test_accuracy": 0.602, "sae_top_2_test_accuracy": 0.878, "sae_top_5_test_accuracy": 0.885, "sae_top_10_test_accuracy": null, "sae_top_20_test_accuracy": null, "sae_top_50_test_accuracy": null, "sae_top_100_test_accuracy": null }, { "dataset_name": "codeparrot/github-code_results", "llm_test_accuracy": 0.9688000440597534, "llm_top_1_test_accuracy": 0.6298, "llm_top_2_test_accuracy": 0.6841999999999999, "llm_top_5_test_accuracy": 0.7826, "llm_top_10_test_accuracy": null, "llm_top_20_test_accuracy": null, "llm_top_50_test_accuracy": null, "llm_top_100_test_accuracy": null, "sae_test_accuracy": 0.9712000489234924, "sae_top_1_test_accuracy": 0.7732, "sae_top_2_test_accuracy": 0.8728, "sae_top_5_test_accuracy": 0.9274000000000001, "sae_top_10_test_accuracy": null, "sae_top_20_test_accuracy": null, "sae_top_50_test_accuracy": null, "sae_top_100_test_accuracy": null }, { "dataset_name": "fancyzhx/ag_news_results", "llm_test_accuracy": 0.9507500529289246, "llm_top_1_test_accuracy": 0.65025, "llm_top_2_test_accuracy": 0.747, "llm_top_5_test_accuracy": 0.82725, "llm_top_10_test_accuracy": null, "llm_top_20_test_accuracy": null, "llm_top_50_test_accuracy": null, "llm_top_100_test_accuracy": null, "sae_test_accuracy": 0.9485000371932983, "sae_top_1_test_accuracy": 0.81775, "sae_top_2_test_accuracy": 0.882, "sae_top_5_test_accuracy": 0.893, "sae_top_10_test_accuracy": null, "sae_top_20_test_accuracy": null, "sae_top_50_test_accuracy": null, "sae_top_100_test_accuracy": null }, { "dataset_name": "Helsinki-NLP/europarl_results", "llm_test_accuracy": 0.9996000289916992, "llm_top_1_test_accuracy": 0.9596, "llm_top_2_test_accuracy": 0.9905999999999999, "llm_top_5_test_accuracy": 0.9987999999999999, "llm_top_10_test_accuracy": null, "llm_top_20_test_accuracy": null, "llm_top_50_test_accuracy": null, "llm_top_100_test_accuracy": null, "sae_test_accuracy": 0.9994000434875489, "sae_top_1_test_accuracy": 0.9603999999999999, "sae_top_2_test_accuracy": 0.9944, "sae_top_5_test_accuracy": 0.9996, "sae_top_10_test_accuracy": null, "sae_top_20_test_accuracy": null, "sae_top_50_test_accuracy": null, "sae_top_100_test_accuracy": null } ], "sae_bench_commit_hash": "Unknown", "sae_lens_id": "blocks.21.hook_resid_post", "sae_lens_release_id": "gemma-2-2b-res-snap-matryoshka-dc", "sae_lens_version": "5.9.1", "sae_cfg_dict": { "architecture": "jumprelu", "d_in": 2304, "d_sae": 32768, "activation_fn_str": "relu", "apply_b_dec_to_input": true, "finetuning_scaling_factor": false, "context_size": 1024, "model_name": "gemma-2-2b", "hook_name": "blocks.21.hook_resid_post", "hook_layer": 21, "hook_head_index": null, 
"prepend_bos": true, "dataset_path": "chanind/pile-uncopyrighted-gemma-1024-abbrv-1B", "dataset_trust_remote_code": true, "normalize_activations": "none", "dtype": "torch.bfloat16", "device": "cuda", "sae_lens_training_version": "5.5.1", "activation_fn_kwargs": { "k": 40 }, "neuronpedia_id": null, "model_from_pretrained_kwargs": { "center_writing_weights": false }, "seqpos_slice": [ null ] }, "eval_result_unstructured": { "LabHC/bias_in_bios_class_set1_results": { "sae_test_accuracy": { "0": 0.9450000524520874, "1": 0.9630000591278076, "2": 0.9460000395774841, "6": 0.9900000691413879, "9": 0.9790000319480896 }, "llm_test_accuracy": { "0": 0.9510000348091125, "1": 0.9630000591278076, "2": 0.9580000638961792, "6": 0.9880000352859497, "9": 0.9810000658035278 }, "llm_top_1_test_accuracy": { "0": 0.578, "1": 0.658, "2": 0.691, "6": 0.744, "9": 0.662 }, "llm_top_2_test_accuracy": { "0": 0.567, "1": 0.67, "2": 0.786, "6": 0.807, "9": 0.746 }, "llm_top_5_test_accuracy": { "0": 0.619, "1": 0.708, "2": 0.828, "6": 0.906, "9": 0.928 }, "sae_top_1_test_accuracy": { "0": 0.777, "1": 0.795, "2": 0.879, "6": 0.98, "9": 0.949 }, "sae_top_2_test_accuracy": { "0": 0.768, "1": 0.802, "2": 0.879, "6": 0.989, "9": 0.954 }, "sae_top_5_test_accuracy": { "0": 0.888, "1": 0.914, "2": 0.909, "6": 0.991, "9": 0.954 } }, "LabHC/bias_in_bios_class_set2_results": { "sae_test_accuracy": { "11": 0.9580000638961792, "13": 0.9430000185966492, "14": 0.9600000381469727, "18": 0.9320000410079956, "19": 0.9500000476837158 }, "llm_test_accuracy": { "11": 0.9660000205039978, "13": 0.956000030040741, "14": 0.9690000414848328, "18": 0.9410000443458557, "19": 0.9620000720024109 }, "llm_top_1_test_accuracy": { "11": 0.558, "13": 0.666, "14": 0.647, "18": 0.695, "19": 0.778 }, "llm_top_2_test_accuracy": { "11": 0.598, "13": 0.675, "14": 0.656, "18": 0.693, "19": 0.803 }, "llm_top_5_test_accuracy": { "11": 0.83, "13": 0.756, "14": 0.658, "18": 0.741, "19": 0.793 }, "sae_top_1_test_accuracy": { "11": 0.731, "13": 0.674, "14": 0.909, "18": 0.719, "19": 0.848 }, "sae_top_2_test_accuracy": { "11": 0.858, "13": 0.78, "14": 0.917, "18": 0.858, "19": 0.865 }, "sae_top_5_test_accuracy": { "11": 0.934, "13": 0.804, "14": 0.921, "18": 0.92, "19": 0.89 } }, "LabHC/bias_in_bios_class_set3_results": { "sae_test_accuracy": { "20": 0.9470000267028809, "21": 0.9160000681877136, "22": 0.9250000715255737, "25": 0.9640000462532043, "26": 0.8890000581741333 }, "llm_test_accuracy": { "20": 0.9610000252723694, "21": 0.921000063419342, "22": 0.9160000681877136, "25": 0.9600000381469727, "26": 0.8840000629425049 }, "llm_top_1_test_accuracy": { "20": 0.725, "21": 0.755, "22": 0.652, "25": 0.714, "26": 0.646 }, "llm_top_2_test_accuracy": { "20": 0.81, "21": 0.775, "22": 0.694, "25": 0.738, "26": 0.659 }, "llm_top_5_test_accuracy": { "20": 0.824, "21": 0.807, "22": 0.801, "25": 0.787, "26": 0.706 }, "sae_top_1_test_accuracy": { "20": 0.852, "21": 0.797, "22": 0.858, "25": 0.878, "26": 0.642 }, "sae_top_2_test_accuracy": { "20": 0.869, "21": 0.807, "22": 0.851, "25": 0.861, "26": 0.732 }, "sae_top_5_test_accuracy": { "20": 0.928, "21": 0.85, "22": 0.861, "25": 0.898, "26": 0.803 } }, "canrager/amazon_reviews_mcauley_1and5_results": { "sae_test_accuracy": { "1": 0.9610000252723694, "2": 0.9410000443458557, "3": 0.940000057220459, "5": 0.9440000653266907, "6": 0.893000066280365 }, "llm_test_accuracy": { "1": 0.9630000591278076, "2": 0.940000057220459, "3": 0.937000036239624, "5": 0.9330000281333923, "6": 0.893000066280365 }, "llm_top_1_test_accuracy": { "1": 
0.688, "2": 0.795, "3": 0.63, "5": 0.566, "6": 0.693 }, "llm_top_2_test_accuracy": { "1": 0.804, "2": 0.803, "3": 0.642, "5": 0.776, "6": 0.672 }, "llm_top_5_test_accuracy": { "1": 0.824, "2": 0.886, "3": 0.762, "5": 0.822, "6": 0.713 }, "sae_top_1_test_accuracy": { "1": 0.888, "2": 0.864, "3": 0.683, "5": 0.864, "6": 0.751 }, "sae_top_2_test_accuracy": { "1": 0.906, "2": 0.868, "3": 0.753, "5": 0.896, "6": 0.77 }, "sae_top_5_test_accuracy": { "1": 0.924, "2": 0.891, "3": 0.772, "5": 0.888, "6": 0.76 } }, "canrager/amazon_reviews_mcauley_1and5_sentiment_results": { "sae_test_accuracy": { "1.0": 0.9640000462532043, "5.0": 0.9610000252723694 }, "llm_test_accuracy": { "1.0": 0.9650000333786011, "5.0": 0.9690000414848328 }, "llm_top_1_test_accuracy": { "1.0": 0.685, "5.0": 0.685 }, "llm_top_2_test_accuracy": { "1.0": 0.749, "5.0": 0.749 }, "llm_top_5_test_accuracy": { "1.0": 0.79, "5.0": 0.79 }, "sae_top_1_test_accuracy": { "1.0": 0.602, "5.0": 0.602 }, "sae_top_2_test_accuracy": { "1.0": 0.878, "5.0": 0.878 }, "sae_top_5_test_accuracy": { "1.0": 0.885, "5.0": 0.885 } }, "codeparrot/github-code_results": { "sae_test_accuracy": { "C": 0.9510000348091125, "Python": 0.9850000739097595, "HTML": 0.9830000400543213, "Java": 0.9660000205039978, "PHP": 0.971000075340271 }, "llm_test_accuracy": { "C": 0.9600000381469727, "Python": 0.9880000352859497, "HTML": 0.9850000739097595, "Java": 0.9570000171661377, "PHP": 0.9540000557899475 }, "llm_top_1_test_accuracy": { "C": 0.559, "Python": 0.62, "HTML": 0.801, "Java": 0.595, "PHP": 0.574 }, "llm_top_2_test_accuracy": { "C": 0.657, "Python": 0.629, "HTML": 0.872, "Java": 0.617, "PHP": 0.646 }, "llm_top_5_test_accuracy": { "C": 0.852, "Python": 0.726, "HTML": 0.89, "Java": 0.76, "PHP": 0.685 }, "sae_top_1_test_accuracy": { "C": 0.789, "Python": 0.623, "HTML": 0.924, "Java": 0.6, "PHP": 0.93 }, "sae_top_2_test_accuracy": { "C": 0.892, "Python": 0.938, "HTML": 0.94, "Java": 0.656, "PHP": 0.938 }, "sae_top_5_test_accuracy": { "C": 0.881, "Python": 0.972, "HTML": 0.952, "Java": 0.897, "PHP": 0.935 } }, "fancyzhx/ag_news_results": { "sae_test_accuracy": { "0": 0.9380000233650208, "1": 0.9790000319480896, "2": 0.9310000538825989, "3": 0.9460000395774841 }, "llm_test_accuracy": { "0": 0.940000057220459, "1": 0.9880000352859497, "2": 0.9270000457763672, "3": 0.9480000734329224 }, "llm_top_1_test_accuracy": { "0": 0.69, "1": 0.636, "2": 0.555, "3": 0.72 }, "llm_top_2_test_accuracy": { "0": 0.784, "1": 0.786, "2": 0.679, "3": 0.739 }, "llm_top_5_test_accuracy": { "0": 0.833, "1": 0.86, "2": 0.801, "3": 0.815 }, "sae_top_1_test_accuracy": { "0": 0.866, "1": 0.851, "2": 0.81, "3": 0.744 }, "sae_top_2_test_accuracy": { "0": 0.873, "1": 0.977, "2": 0.867, "3": 0.811 }, "sae_top_5_test_accuracy": { "0": 0.882, "1": 0.975, "2": 0.852, "3": 0.863 } }, "Helsinki-NLP/europarl_results": { "sae_test_accuracy": { "en": 0.999000072479248, "fr": 0.999000072479248, "de": 1.0, "es": 0.999000072479248, "nl": 1.0 }, "llm_test_accuracy": { "en": 1.0, "fr": 1.0, "de": 0.999000072479248, "es": 0.999000072479248, "nl": 1.0 }, "llm_top_1_test_accuracy": { "en": 0.94, "fr": 0.996, "de": 1.0, "es": 0.862, "nl": 1.0 }, "llm_top_2_test_accuracy": { "en": 0.958, "fr": 0.996, "de": 0.999, "es": 1.0, "nl": 1.0 }, "llm_top_5_test_accuracy": { "en": 0.999, "fr": 0.998, "de": 0.997, "es": 1.0, "nl": 1.0 }, "sae_top_1_test_accuracy": { "en": 0.944, "fr": 0.998, "de": 1.0, "es": 0.86, "nl": 1.0 }, "sae_top_2_test_accuracy": { "en": 0.976, "fr": 0.997, "de": 1.0, "es": 0.999, "nl": 1.0 }, 
"sae_top_5_test_accuracy": { "en": 1.0, "fr": 0.999, "de": 1.0, "es": 0.999, "nl": 1.0 } } } }