| { |
| "eval_type_id": "tpp", |
| "eval_config": { |
| "random_seed": 42, |
| "dataset_names": [ |
| "LabHC/bias_in_bios_class_set1", |
| "canrager/amazon_reviews_mcauley_1and5" |
| ], |
| "perform_scr": false, |
| "early_stopping_patience": 20, |
| "train_set_size": 4000, |
| "test_set_size": 1000, |
| "context_length": 128, |
| "probe_train_batch_size": 16, |
| "probe_test_batch_size": 500, |
| "probe_epochs": 20, |
| "probe_lr": 0.001, |
| "probe_l1_penalty": 0.001, |
| "sae_batch_size": 125, |
| "llm_batch_size": 32, |
| "llm_dtype": "bfloat16", |
| "lower_vram_usage": false, |
| "model_name": "gemma-2-2b", |
| "n_values": [ |
| 2, |
| 5, |
| 10, |
| 20, |
| 50, |
| 100, |
| 500 |
| ], |
| "column1_vals_lookup": { |
| "LabHC/bias_in_bios_class_set1": [ |
| [ |
| "professor", |
| "nurse" |
| ], |
| [ |
| "architect", |
| "journalist" |
| ], |
| [ |
| "surgeon", |
| "psychologist" |
| ], |
| [ |
| "attorney", |
| "teacher" |
| ] |
| ], |
| "canrager/amazon_reviews_mcauley_1and5": [ |
| [ |
| "Books", |
| "CDs_and_Vinyl" |
| ], |
| [ |
| "Software", |
| "Electronics" |
| ], |
| [ |
| "Pet_Supplies", |
| "Office_Products" |
| ], |
| [ |
| "Industrial_and_Scientific", |
| "Toys_and_Games" |
| ] |
| ] |
| } |
| }, |
| "eval_id": "65b37170-42a3-4f63-bfdc-b4728a912b3a", |
| "datetime_epoch_millis": 1745618053623, |
| "eval_result_metrics": { |
| "tpp_metrics": { |
| "tpp_threshold_2_total_metric": 0.06602499783039094, |
| "tpp_threshold_2_intended_diff_only": 0.08669999837875367, |
| "tpp_threshold_2_unintended_diff_only": 0.020675000548362733, |
| "tpp_threshold_5_total_metric": 0.12235000431537629, |
| "tpp_threshold_5_intended_diff_only": 0.1671000063419342, |
| "tpp_threshold_5_unintended_diff_only": 0.04475000202655792, |
| "tpp_threshold_10_total_metric": 0.13342500627040862, |
| "tpp_threshold_10_intended_diff_only": 0.21480000615119935, |
| "tpp_threshold_10_unintended_diff_only": 0.08137499988079071, |
| "tpp_threshold_20_total_metric": 0.1342000052332878, |
| "tpp_threshold_20_intended_diff_only": 0.2546000123023987, |
| "tpp_threshold_20_unintended_diff_only": 0.12040000706911087, |
| "tpp_threshold_50_total_metric": 0.18002499938011168, |
| "tpp_threshold_50_intended_diff_only": 0.3384000062942505, |
| "tpp_threshold_50_unintended_diff_only": 0.1583750069141388, |
| "tpp_threshold_100_total_metric": 0.21695000678300858, |
| "tpp_threshold_100_intended_diff_only": 0.40460001230239867, |
| "tpp_threshold_100_unintended_diff_only": 0.1876500055193901, |
| "tpp_threshold_500_total_metric": 0.21200003325939176, |
| "tpp_threshold_500_intended_diff_only": 0.44580004215240476, |
| "tpp_threshold_500_unintended_diff_only": 0.233800008893013 |
| } |
| }, |
| "eval_result_details": [ |
| { |
| "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", |
| "tpp_threshold_2_total_metric": 0.1222499966621399, |
| "tpp_threshold_2_intended_diff_only": 0.1615999937057495, |
| "tpp_threshold_2_unintended_diff_only": 0.03934999704360962, |
| "tpp_threshold_5_total_metric": 0.2191500186920166, |
| "tpp_threshold_5_intended_diff_only": 0.30340001583099363, |
| "tpp_threshold_5_unintended_diff_only": 0.08424999713897705, |
| "tpp_threshold_10_total_metric": 0.2258500188589096, |
| "tpp_threshold_10_intended_diff_only": 0.38160001039505004, |
| "tpp_threshold_10_unintended_diff_only": 0.15574999153614044, |
| "tpp_threshold_20_total_metric": 0.19325000345706939, |
| "tpp_threshold_20_intended_diff_only": 0.4192000150680542, |
| "tpp_threshold_20_unintended_diff_only": 0.2259500116109848, |
| "tpp_threshold_50_total_metric": 0.15480000376701356, |
| "tpp_threshold_50_intended_diff_only": 0.4492000102996826, |
| "tpp_threshold_50_unintended_diff_only": 0.29440000653266907, |
| "tpp_threshold_100_total_metric": 0.136950021982193, |
| "tpp_threshold_100_intended_diff_only": 0.46260002851486204, |
| "tpp_threshold_100_unintended_diff_only": 0.32565000653266907, |
| "tpp_threshold_500_total_metric": 0.08265002667903901, |
| "tpp_threshold_500_intended_diff_only": 0.46920003890991213, |
| "tpp_threshold_500_unintended_diff_only": 0.3865500122308731 |
| }, |
| { |
| "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", |
| "tpp_threshold_2_total_metric": 0.009799998998641968, |
| "tpp_threshold_2_intended_diff_only": 0.011800003051757813, |
| "tpp_threshold_2_unintended_diff_only": 0.0020000040531158447, |
| "tpp_threshold_5_total_metric": 0.025549989938735963, |
| "tpp_threshold_5_intended_diff_only": 0.030799996852874757, |
| "tpp_threshold_5_unintended_diff_only": 0.005250006914138794, |
| "tpp_threshold_10_total_metric": 0.040999993681907654, |
| "tpp_threshold_10_intended_diff_only": 0.048000001907348634, |
| "tpp_threshold_10_unintended_diff_only": 0.007000008225440979, |
| "tpp_threshold_20_total_metric": 0.07515000700950622, |
| "tpp_threshold_20_intended_diff_only": 0.09000000953674317, |
| "tpp_threshold_20_unintended_diff_only": 0.01485000252723694, |
| "tpp_threshold_50_total_metric": 0.20524999499320984, |
| "tpp_threshold_50_intended_diff_only": 0.22760000228881835, |
| "tpp_threshold_50_unintended_diff_only": 0.022350007295608522, |
| "tpp_threshold_100_total_metric": 0.2969499915838242, |
| "tpp_threshold_100_intended_diff_only": 0.3465999960899353, |
| "tpp_threshold_100_unintended_diff_only": 0.049650004506111144, |
| "tpp_threshold_500_total_metric": 0.34135003983974455, |
| "tpp_threshold_500_intended_diff_only": 0.42240004539489745, |
| "tpp_threshold_500_unintended_diff_only": 0.08105000555515289 |
| } |
| ], |
| "sae_bench_commit_hash": "Unknown", |
| "sae_lens_id": "blocks.9.hook_resid_post", |
| "sae_lens_release_id": "gemma-2-2b-res-snap-matryoshka-dc", |
| "sae_lens_version": "5.9.1", |
| "sae_cfg_dict": { |
| "architecture": "jumprelu", |
| "d_in": 2304, |
| "d_sae": 32768, |
| "activation_fn_str": "relu", |
| "apply_b_dec_to_input": true, |
| "finetuning_scaling_factor": false, |
| "context_size": 1024, |
| "model_name": "gemma-2-2b", |
| "hook_name": "blocks.9.hook_resid_post", |
| "hook_layer": 9, |
| "hook_head_index": null, |
| "prepend_bos": true, |
| "dataset_path": "chanind/pile-uncopyrighted-gemma-1024-abbrv-1B", |
| "dataset_trust_remote_code": true, |
| "normalize_activations": "none", |
| "dtype": "torch.bfloat16", |
| "device": "cuda", |
| "sae_lens_training_version": "5.5.1", |
| "activation_fn_kwargs": { |
| "k": 40 |
| }, |
| "neuronpedia_id": null, |
| "model_from_pretrained_kwargs": { |
| "center_writing_weights": false |
| }, |
| "seqpos_slice": [ |
| null |
| ] |
| }, |
| "eval_result_unstructured": { |
| "LabHC/bias_in_bios_class_set1": { |
| "0": { |
| "tpp_threshold_2_total_metric": 0.19500000774860382, |
| "tpp_threshold_2_intended_diff_only": 0.28700000047683716, |
| "tpp_threshold_2_unintended_diff_only": 0.09199999272823334, |
| "tpp_threshold_5_total_metric": 0.22450006008148193, |
| "tpp_threshold_5_intended_diff_only": 0.3290000557899475, |
| "tpp_threshold_5_unintended_diff_only": 0.10449999570846558, |
| "tpp_threshold_10_total_metric": 0.21625006198883057, |
| "tpp_threshold_10_intended_diff_only": 0.35700005292892456, |
| "tpp_threshold_10_unintended_diff_only": 0.140749990940094, |
| "tpp_threshold_20_total_metric": 0.15175005793571472, |
| "tpp_threshold_20_intended_diff_only": 0.40400004386901855, |
| "tpp_threshold_20_unintended_diff_only": 0.25224998593330383, |
| "tpp_threshold_50_total_metric": 0.11925002932548523, |
| "tpp_threshold_50_intended_diff_only": 0.43300002813339233, |
| "tpp_threshold_50_unintended_diff_only": 0.3137499988079071, |
| "tpp_threshold_100_total_metric": 0.122250035405159, |
| "tpp_threshold_100_intended_diff_only": 0.44600003957748413, |
| "tpp_threshold_100_unintended_diff_only": 0.32375000417232513, |
| "tpp_threshold_500_total_metric": 0.05800004303455353, |
| "tpp_threshold_500_intended_diff_only": 0.4490000605583191, |
| "tpp_threshold_500_unintended_diff_only": 0.39100001752376556 |
| }, |
| "1": { |
| "tpp_threshold_2_total_metric": 0.12150003015995026, |
| "tpp_threshold_2_intended_diff_only": 0.15000003576278687, |
| "tpp_threshold_2_unintended_diff_only": 0.02850000560283661, |
| "tpp_threshold_5_total_metric": 0.17075003683567047, |
| "tpp_threshold_5_intended_diff_only": 0.20200002193450928, |
| "tpp_threshold_5_unintended_diff_only": 0.031249985098838806, |
| "tpp_threshold_10_total_metric": 0.1925000101327896, |
| "tpp_threshold_10_intended_diff_only": 0.3240000009536743, |
| "tpp_threshold_10_unintended_diff_only": 0.1314999908208847, |
| "tpp_threshold_20_total_metric": 0.1742500364780426, |
| "tpp_threshold_20_intended_diff_only": 0.3670000433921814, |
| "tpp_threshold_20_unintended_diff_only": 0.1927500069141388, |
| "tpp_threshold_50_total_metric": 0.13425002992153168, |
| "tpp_threshold_50_intended_diff_only": 0.409000039100647, |
| "tpp_threshold_50_unintended_diff_only": 0.2747500091791153, |
| "tpp_threshold_100_total_metric": 0.10575003921985626, |
| "tpp_threshold_100_intended_diff_only": 0.4450000524520874, |
| "tpp_threshold_100_unintended_diff_only": 0.33925001323223114, |
| "tpp_threshold_500_total_metric": 0.07150006294250488, |
| "tpp_threshold_500_intended_diff_only": 0.4630000591278076, |
| "tpp_threshold_500_unintended_diff_only": 0.39149999618530273 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.1105000227689743, |
| "tpp_threshold_2_intended_diff_only": 0.15200001001358032, |
| "tpp_threshold_2_unintended_diff_only": 0.04149998724460602, |
| "tpp_threshold_5_total_metric": 0.1745000034570694, |
| "tpp_threshold_5_intended_diff_only": 0.29100000858306885, |
| "tpp_threshold_5_unintended_diff_only": 0.11650000512599945, |
| "tpp_threshold_10_total_metric": 0.14624999463558197, |
| "tpp_threshold_10_intended_diff_only": 0.35199999809265137, |
| "tpp_threshold_10_unintended_diff_only": 0.2057500034570694, |
| "tpp_threshold_20_total_metric": 0.16099998354911804, |
| "tpp_threshold_20_intended_diff_only": 0.3840000033378601, |
| "tpp_threshold_20_unintended_diff_only": 0.22300001978874207, |
| "tpp_threshold_50_total_metric": 0.11650003492832184, |
| "tpp_threshold_50_intended_diff_only": 0.4280000329017639, |
| "tpp_threshold_50_unintended_diff_only": 0.3114999979734421, |
| "tpp_threshold_100_total_metric": 0.101500004529953, |
| "tpp_threshold_100_intended_diff_only": 0.4440000057220459, |
| "tpp_threshold_100_unintended_diff_only": 0.3425000011920929, |
| "tpp_threshold_500_total_metric": 0.05625002086162567, |
| "tpp_threshold_500_intended_diff_only": 0.45600003004074097, |
| "tpp_threshold_500_unintended_diff_only": 0.3997500091791153 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": 0.13799996674060822, |
| "tpp_threshold_2_intended_diff_only": 0.16499996185302734, |
| "tpp_threshold_2_unintended_diff_only": 0.02699999511241913, |
| "tpp_threshold_5_total_metric": 0.31074999272823334, |
| "tpp_threshold_5_intended_diff_only": 0.4269999861717224, |
| "tpp_threshold_5_unintended_diff_only": 0.11624999344348907, |
| "tpp_threshold_10_total_metric": 0.33125002682209015, |
| "tpp_threshold_10_intended_diff_only": 0.47600001096725464, |
| "tpp_threshold_10_unintended_diff_only": 0.1447499841451645, |
| "tpp_threshold_20_total_metric": 0.26499998569488525, |
| "tpp_threshold_20_intended_diff_only": 0.49000000953674316, |
| "tpp_threshold_20_unintended_diff_only": 0.2250000238418579, |
| "tpp_threshold_50_total_metric": 0.22049996256828308, |
| "tpp_threshold_50_intended_diff_only": 0.49699997901916504, |
| "tpp_threshold_50_unintended_diff_only": 0.27650001645088196, |
| "tpp_threshold_100_total_metric": 0.2042500227689743, |
| "tpp_threshold_100_intended_diff_only": 0.49800002574920654, |
| "tpp_threshold_100_unintended_diff_only": 0.29375000298023224, |
| "tpp_threshold_500_total_metric": 0.1315000057220459, |
| "tpp_threshold_500_intended_diff_only": 0.49800002574920654, |
| "tpp_threshold_500_unintended_diff_only": 0.36650002002716064 |
| }, |
| "9": { |
| "tpp_threshold_2_total_metric": 0.046249955892562866, |
| "tpp_threshold_2_intended_diff_only": 0.05399996042251587, |
| "tpp_threshold_2_unintended_diff_only": 0.007750004529953003, |
| "tpp_threshold_5_total_metric": 0.21525000035762787, |
| "tpp_threshold_5_intended_diff_only": 0.2680000066757202, |
| "tpp_threshold_5_unintended_diff_only": 0.052750006318092346, |
| "tpp_threshold_10_total_metric": 0.24300000071525574, |
| "tpp_threshold_10_intended_diff_only": 0.39899998903274536, |
| "tpp_threshold_10_unintended_diff_only": 0.15599998831748962, |
| "tpp_threshold_20_total_metric": 0.21424995362758636, |
| "tpp_threshold_20_intended_diff_only": 0.4509999752044678, |
| "tpp_threshold_20_unintended_diff_only": 0.2367500215768814, |
| "tpp_threshold_50_total_metric": 0.18349996209144592, |
| "tpp_threshold_50_intended_diff_only": 0.4789999723434448, |
| "tpp_threshold_50_unintended_diff_only": 0.2955000102519989, |
| "tpp_threshold_100_total_metric": 0.1510000079870224, |
| "tpp_threshold_100_intended_diff_only": 0.48000001907348633, |
| "tpp_threshold_100_unintended_diff_only": 0.32900001108646393, |
| "tpp_threshold_500_total_metric": 0.09600000083446503, |
| "tpp_threshold_500_intended_diff_only": 0.48000001907348633, |
| "tpp_threshold_500_unintended_diff_only": 0.3840000182390213 |
| } |
| }, |
| "canrager/amazon_reviews_mcauley_1and5": { |
| "1": { |
| "tpp_threshold_2_total_metric": 0.008999988436698914, |
| "tpp_threshold_2_intended_diff_only": 0.009999990463256836, |
| "tpp_threshold_2_unintended_diff_only": 0.0010000020265579224, |
| "tpp_threshold_5_total_metric": 0.008499950170516968, |
| "tpp_threshold_5_intended_diff_only": 0.011999964714050293, |
| "tpp_threshold_5_unintended_diff_only": 0.003500014543533325, |
| "tpp_threshold_10_total_metric": 0.015000015497207642, |
| "tpp_threshold_10_intended_diff_only": 0.022000014781951904, |
| "tpp_threshold_10_unintended_diff_only": 0.006999999284744263, |
| "tpp_threshold_20_total_metric": 0.02699996531009674, |
| "tpp_threshold_20_intended_diff_only": 0.04399996995925903, |
| "tpp_threshold_20_unintended_diff_only": 0.017000004649162292, |
| "tpp_threshold_50_total_metric": 0.09549997746944427, |
| "tpp_threshold_50_intended_diff_only": 0.11699998378753662, |
| "tpp_threshold_50_unintended_diff_only": 0.021500006318092346, |
| "tpp_threshold_100_total_metric": 0.21399997174739838, |
| "tpp_threshold_100_intended_diff_only": 0.24699997901916504, |
| "tpp_threshold_100_unintended_diff_only": 0.03300000727176666, |
| "tpp_threshold_500_total_metric": 0.38450001180171967, |
| "tpp_threshold_500_intended_diff_only": 0.44700002670288086, |
| "tpp_threshold_500_unintended_diff_only": 0.0625000149011612 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.011000022292137146, |
| "tpp_threshold_2_intended_diff_only": 0.012000024318695068, |
| "tpp_threshold_2_unintended_diff_only": 0.0010000020265579224, |
| "tpp_threshold_5_total_metric": 0.022250041365623474, |
| "tpp_threshold_5_intended_diff_only": 0.029000043869018555, |
| "tpp_threshold_5_unintended_diff_only": 0.006750002503395081, |
| "tpp_threshold_10_total_metric": 0.04749998450279236, |
| "tpp_threshold_10_intended_diff_only": 0.0559999942779541, |
| "tpp_threshold_10_unintended_diff_only": 0.008500009775161743, |
| "tpp_threshold_20_total_metric": 0.08575001358985901, |
| "tpp_threshold_20_intended_diff_only": 0.10600000619888306, |
| "tpp_threshold_20_unintended_diff_only": 0.020249992609024048, |
| "tpp_threshold_50_total_metric": 0.30675002932548523, |
| "tpp_threshold_50_intended_diff_only": 0.3450000286102295, |
| "tpp_threshold_50_unintended_diff_only": 0.03824999928474426, |
| "tpp_threshold_100_total_metric": 0.3752500116825104, |
| "tpp_threshold_100_intended_diff_only": 0.42500001192092896, |
| "tpp_threshold_100_unintended_diff_only": 0.04975000023841858, |
| "tpp_threshold_500_total_metric": 0.3630000799894333, |
| "tpp_threshold_500_intended_diff_only": 0.4440000653266907, |
| "tpp_threshold_500_unintended_diff_only": 0.08099998533725739 |
| }, |
| "3": { |
| "tpp_threshold_2_total_metric": 0.001999989151954651, |
| "tpp_threshold_2_intended_diff_only": 0.0009999871253967285, |
| "tpp_threshold_2_unintended_diff_only": -0.0010000020265579224, |
| "tpp_threshold_5_total_metric": -0.0002500265836715698, |
| "tpp_threshold_5_intended_diff_only": 0.0059999823570251465, |
| "tpp_threshold_5_unintended_diff_only": 0.006250008940696716, |
| "tpp_threshold_10_total_metric": -0.00625002384185791, |
| "tpp_threshold_10_intended_diff_only": 0.0009999871253967285, |
| "tpp_threshold_10_unintended_diff_only": 0.007250010967254639, |
| "tpp_threshold_20_total_metric": 0.018999993801116943, |
| "tpp_threshold_20_intended_diff_only": 0.027000010013580322, |
| "tpp_threshold_20_unintended_diff_only": 0.008000016212463379, |
| "tpp_threshold_50_total_metric": 0.0637499988079071, |
| "tpp_threshold_50_intended_diff_only": 0.078000009059906, |
| "tpp_threshold_50_unintended_diff_only": 0.014250010251998901, |
| "tpp_threshold_100_total_metric": 0.20999999344348907, |
| "tpp_threshold_100_intended_diff_only": 0.3109999895095825, |
| "tpp_threshold_100_unintended_diff_only": 0.10099999606609344, |
| "tpp_threshold_500_total_metric": 0.26975002884864807, |
| "tpp_threshold_500_intended_diff_only": 0.4180000424385071, |
| "tpp_threshold_500_unintended_diff_only": 0.148250013589859 |
| }, |
| "5": { |
| "tpp_threshold_2_total_metric": 0.0012499988079071045, |
| "tpp_threshold_2_intended_diff_only": 0.008000016212463379, |
| "tpp_threshold_2_unintended_diff_only": 0.006750017404556274, |
| "tpp_threshold_5_total_metric": 0.04625000059604645, |
| "tpp_threshold_5_intended_diff_only": 0.05500000715255737, |
| "tpp_threshold_5_unintended_diff_only": 0.008750006556510925, |
| "tpp_threshold_10_total_metric": 0.09175001084804535, |
| "tpp_threshold_10_intended_diff_only": 0.10000002384185791, |
| "tpp_threshold_10_unintended_diff_only": 0.008250012993812561, |
| "tpp_threshold_20_total_metric": 0.13075003027915955, |
| "tpp_threshold_20_intended_diff_only": 0.14600002765655518, |
| "tpp_threshold_20_unintended_diff_only": 0.01524999737739563, |
| "tpp_threshold_50_total_metric": 0.3217499703168869, |
| "tpp_threshold_50_intended_diff_only": 0.33899998664855957, |
| "tpp_threshold_50_unintended_diff_only": 0.01725001633167267, |
| "tpp_threshold_100_total_metric": 0.35899998247623444, |
| "tpp_threshold_100_intended_diff_only": 0.39800000190734863, |
| "tpp_threshold_100_unintended_diff_only": 0.0390000194311142, |
| "tpp_threshold_500_total_metric": 0.34825003147125244, |
| "tpp_threshold_500_intended_diff_only": 0.4180000424385071, |
| "tpp_threshold_500_unintended_diff_only": 0.06975001096725464 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": 0.025749996304512024, |
| "tpp_threshold_2_intended_diff_only": 0.02799999713897705, |
| "tpp_threshold_2_unintended_diff_only": 0.002250000834465027, |
| "tpp_threshold_5_total_metric": 0.05099998414516449, |
| "tpp_threshold_5_intended_diff_only": 0.05199998617172241, |
| "tpp_threshold_5_unintended_diff_only": 0.0010000020265579224, |
| "tpp_threshold_10_total_metric": 0.05699998140335083, |
| "tpp_threshold_10_intended_diff_only": 0.06099998950958252, |
| "tpp_threshold_10_unintended_diff_only": 0.0040000081062316895, |
| "tpp_threshold_20_total_metric": 0.11325003206729889, |
| "tpp_threshold_20_intended_diff_only": 0.12700003385543823, |
| "tpp_threshold_20_unintended_diff_only": 0.013750001788139343, |
| "tpp_threshold_50_total_metric": 0.23849999904632568, |
| "tpp_threshold_50_intended_diff_only": 0.2590000033378601, |
| "tpp_threshold_50_unintended_diff_only": 0.020500004291534424, |
| "tpp_threshold_100_total_metric": 0.3264999985694885, |
| "tpp_threshold_100_intended_diff_only": 0.35199999809265137, |
| "tpp_threshold_100_unintended_diff_only": 0.025499999523162842, |
| "tpp_threshold_500_total_metric": 0.3412500470876694, |
| "tpp_threshold_500_intended_diff_only": 0.3850000500679016, |
| "tpp_threshold_500_unintended_diff_only": 0.04375000298023224 |
| } |
| } |
| } |
| } |