| { |
| "eval_type_id": "tpp", |
| "eval_config": { |
| "random_seed": 42, |
| "dataset_names": [ |
| "LabHC/bias_in_bios_class_set1", |
| "canrager/amazon_reviews_mcauley_1and5" |
| ], |
| "perform_scr": false, |
| "early_stopping_patience": 20, |
| "train_set_size": 4000, |
| "test_set_size": 1000, |
| "context_length": 128, |
| "probe_train_batch_size": 16, |
| "probe_test_batch_size": 500, |
| "probe_epochs": 20, |
| "probe_lr": 0.001, |
| "probe_l1_penalty": 0.001, |
| "sae_batch_size": 125, |
| "llm_batch_size": 32, |
| "llm_dtype": "bfloat16", |
| "lower_vram_usage": false, |
| "model_name": "gemma-2-2b", |
| "n_values": [ |
| 2, |
| 5, |
| 10, |
| 20, |
| 50, |
| 100, |
| 500 |
| ], |
| "column1_vals_lookup": { |
| "LabHC/bias_in_bios_class_set1": [ |
| [ |
| "professor", |
| "nurse" |
| ], |
| [ |
| "architect", |
| "journalist" |
| ], |
| [ |
| "surgeon", |
| "psychologist" |
| ], |
| [ |
| "attorney", |
| "teacher" |
| ] |
| ], |
| "canrager/amazon_reviews_mcauley_1and5": [ |
| [ |
| "Books", |
| "CDs_and_Vinyl" |
| ], |
| [ |
| "Software", |
| "Electronics" |
| ], |
| [ |
| "Pet_Supplies", |
| "Office_Products" |
| ], |
| [ |
| "Industrial_and_Scientific", |
| "Toys_and_Games" |
| ] |
| ] |
| } |
| }, |
| "eval_id": "91ffb92c-ec01-47b1-9f45-773ec12047b3", |
| "datetime_epoch_millis": 1745752059044, |
| "eval_result_metrics": { |
| "tpp_metrics": { |
| "tpp_threshold_2_total_metric": 0.07742500007152557, |
| "tpp_threshold_2_intended_diff_only": 0.11740000247955322, |
| "tpp_threshold_2_unintended_diff_only": 0.03997500240802765, |
| "tpp_threshold_5_total_metric": 0.1333000048995018, |
| "tpp_threshold_5_intended_diff_only": 0.20090001225471496, |
| "tpp_threshold_5_unintended_diff_only": 0.06760000735521317, |
| "tpp_threshold_10_total_metric": 0.16679999828338624, |
| "tpp_threshold_10_intended_diff_only": 0.2764000058174133, |
| "tpp_threshold_10_unintended_diff_only": 0.1096000075340271, |
| "tpp_threshold_20_total_metric": 0.2350000128149986, |
| "tpp_threshold_20_intended_diff_only": 0.3734000205993653, |
| "tpp_threshold_20_unintended_diff_only": 0.1384000077843666, |
| "tpp_threshold_50_total_metric": 0.2238250344991684, |
| "tpp_threshold_50_intended_diff_only": 0.41890003681182864, |
| "tpp_threshold_50_unintended_diff_only": 0.19507500231266023, |
| "tpp_threshold_100_total_metric": 0.19562502950429916, |
| "tpp_threshold_100_intended_diff_only": 0.4243000388145447, |
| "tpp_threshold_100_unintended_diff_only": 0.22867500931024554, |
| "tpp_threshold_500_total_metric": 0.13640003204345702, |
| "tpp_threshold_500_intended_diff_only": 0.42440004348754884, |
| "tpp_threshold_500_unintended_diff_only": 0.2880000114440918 |
| } |
| }, |
| "eval_result_details": [ |
| { |
| "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", |
| "tpp_threshold_2_total_metric": 0.12845000624656677, |
| "tpp_threshold_2_intended_diff_only": 0.203000009059906, |
| "tpp_threshold_2_unintended_diff_only": 0.07455000281333923, |
| "tpp_threshold_5_total_metric": 0.20644999742507936, |
| "tpp_threshold_5_intended_diff_only": 0.33280000686645506, |
| "tpp_threshold_5_unintended_diff_only": 0.12635000944137573, |
| "tpp_threshold_10_total_metric": 0.21114999651908875, |
| "tpp_threshold_10_intended_diff_only": 0.4152000069618225, |
| "tpp_threshold_10_unintended_diff_only": 0.20405001044273377, |
| "tpp_threshold_20_total_metric": 0.21230002641677856, |
| "tpp_threshold_20_intended_diff_only": 0.4556000351905823, |
| "tpp_threshold_20_unintended_diff_only": 0.2433000087738037, |
| "tpp_threshold_50_total_metric": 0.13145002126693725, |
| "tpp_threshold_50_intended_diff_only": 0.45940003395080564, |
| "tpp_threshold_50_unintended_diff_only": 0.3279500126838684, |
| "tpp_threshold_100_total_metric": 0.09655003249645233, |
| "tpp_threshold_100_intended_diff_only": 0.46060004234313967, |
| "tpp_threshold_100_unintended_diff_only": 0.36405000984668734, |
| "tpp_threshold_500_total_metric": 0.05490003228187561, |
| "tpp_threshold_500_intended_diff_only": 0.46060004234313967, |
| "tpp_threshold_500_unintended_diff_only": 0.40570001006126405 |
| }, |
| { |
| "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", |
| "tpp_threshold_2_total_metric": 0.026399993896484376, |
| "tpp_threshold_2_intended_diff_only": 0.03179999589920044, |
| "tpp_threshold_2_unintended_diff_only": 0.0054000020027160645, |
| "tpp_threshold_5_total_metric": 0.060150012373924255, |
| "tpp_threshold_5_intended_diff_only": 0.06900001764297485, |
| "tpp_threshold_5_unintended_diff_only": 0.008850005269050599, |
| "tpp_threshold_10_total_metric": 0.12245000004768372, |
| "tpp_threshold_10_intended_diff_only": 0.13760000467300415, |
| "tpp_threshold_10_unintended_diff_only": 0.015150004625320434, |
| "tpp_threshold_20_total_metric": 0.2576999992132187, |
| "tpp_threshold_20_intended_diff_only": 0.2912000060081482, |
| "tpp_threshold_20_unintended_diff_only": 0.0335000067949295, |
| "tpp_threshold_50_total_metric": 0.31620004773139954, |
| "tpp_threshold_50_intended_diff_only": 0.3784000396728516, |
| "tpp_threshold_50_unintended_diff_only": 0.062199991941452024, |
| "tpp_threshold_100_total_metric": 0.294700026512146, |
| "tpp_threshold_100_intended_diff_only": 0.38800003528594973, |
| "tpp_threshold_100_unintended_diff_only": 0.0933000087738037, |
| "tpp_threshold_500_total_metric": 0.21790003180503845, |
| "tpp_threshold_500_intended_diff_only": 0.388200044631958, |
| "tpp_threshold_500_unintended_diff_only": 0.17030001282691956 |
| } |
| ], |
| "sae_bench_commit_hash": "Unknown", |
| "sae_lens_id": "blocks.0.hook_resid_post", |
| "sae_lens_release_id": "gemma-2-2b-res-matryoshka-dc", |
| "sae_lens_version": "5.9.1", |
| "sae_cfg_dict": { |
| "architecture": "jumprelu", |
| "d_in": 2304, |
| "d_sae": 32768, |
| "activation_fn_str": "relu", |
| "apply_b_dec_to_input": true, |
| "finetuning_scaling_factor": false, |
| "context_size": 1024, |
| "model_name": "gemma-2-2b", |
| "hook_name": "blocks.0.hook_resid_post", |
| "hook_layer": 0, |
| "hook_head_index": null, |
| "prepend_bos": true, |
| "dataset_path": "chanind/pile-uncopyrighted-gemma-1024-abbrv-1B", |
| "dataset_trust_remote_code": true, |
| "normalize_activations": "none", |
| "dtype": "torch.bfloat16", |
| "device": "cuda", |
| "sae_lens_training_version": "5.5.1", |
| "activation_fn_kwargs": { |
| "k": 40 |
| }, |
| "neuronpedia_id": null, |
| "model_from_pretrained_kwargs": { |
| "center_writing_weights": false |
| }, |
| "seqpos_slice": [ |
| null |
| ] |
| }, |
| "eval_result_unstructured": { |
| "LabHC/bias_in_bios_class_set1": { |
| "0": { |
| "tpp_threshold_2_total_metric": 0.1482500284910202, |
| "tpp_threshold_2_intended_diff_only": 0.18400001525878906, |
| "tpp_threshold_2_unintended_diff_only": 0.03574998676776886, |
| "tpp_threshold_5_total_metric": 0.19300003349781036, |
| "tpp_threshold_5_intended_diff_only": 0.34400004148483276, |
| "tpp_threshold_5_unintended_diff_only": 0.1510000079870224, |
| "tpp_threshold_10_total_metric": 0.1950000375509262, |
| "tpp_threshold_10_intended_diff_only": 0.4050000309944153, |
| "tpp_threshold_10_unintended_diff_only": 0.20999999344348907, |
| "tpp_threshold_20_total_metric": 0.21350006759166718, |
| "tpp_threshold_20_intended_diff_only": 0.42500007152557373, |
| "tpp_threshold_20_unintended_diff_only": 0.21150000393390656, |
| "tpp_threshold_50_total_metric": 0.11500002443790436, |
| "tpp_threshold_50_intended_diff_only": 0.43300002813339233, |
| "tpp_threshold_50_unintended_diff_only": 0.318000003695488, |
| "tpp_threshold_100_total_metric": 0.07975007593631744, |
| "tpp_threshold_100_intended_diff_only": 0.43900007009506226, |
| "tpp_threshold_100_unintended_diff_only": 0.3592499941587448, |
| "tpp_threshold_500_total_metric": 0.035250067710876465, |
| "tpp_threshold_500_intended_diff_only": 0.43900007009506226, |
| "tpp_threshold_500_unintended_diff_only": 0.4037500023841858 |
| }, |
| "1": { |
| "tpp_threshold_2_total_metric": 0.1535000056028366, |
| "tpp_threshold_2_intended_diff_only": 0.3050000071525574, |
| "tpp_threshold_2_unintended_diff_only": 0.15150000154972076, |
| "tpp_threshold_5_total_metric": 0.16500000655651093, |
| "tpp_threshold_5_intended_diff_only": 0.3840000033378601, |
| "tpp_threshold_5_unintended_diff_only": 0.21899999678134918, |
| "tpp_threshold_10_total_metric": 0.14249996840953827, |
| "tpp_threshold_10_intended_diff_only": 0.4359999895095825, |
| "tpp_threshold_10_unintended_diff_only": 0.29350002110004425, |
| "tpp_threshold_20_total_metric": 0.14500004053115845, |
| "tpp_threshold_20_intended_diff_only": 0.45500004291534424, |
| "tpp_threshold_20_unintended_diff_only": 0.3100000023841858, |
| "tpp_threshold_50_total_metric": 0.11400002241134644, |
| "tpp_threshold_50_intended_diff_only": 0.45500004291534424, |
| "tpp_threshold_50_unintended_diff_only": 0.3410000205039978, |
| "tpp_threshold_100_total_metric": 0.08250004053115845, |
| "tpp_threshold_100_intended_diff_only": 0.45500004291534424, |
| "tpp_threshold_100_unintended_diff_only": 0.3725000023841858, |
| "tpp_threshold_500_total_metric": 0.04475003480911255, |
| "tpp_threshold_500_intended_diff_only": 0.45500004291534424, |
| "tpp_threshold_500_unintended_diff_only": 0.4102500081062317 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.014250010251998901, |
| "tpp_threshold_2_intended_diff_only": 0.022000014781951904, |
| "tpp_threshold_2_unintended_diff_only": 0.007750004529953003, |
| "tpp_threshold_5_total_metric": 0.25950001180171967, |
| "tpp_threshold_5_intended_diff_only": 0.2940000295639038, |
| "tpp_threshold_5_unintended_diff_only": 0.03450001776218414, |
| "tpp_threshold_10_total_metric": 0.2984999865293503, |
| "tpp_threshold_10_intended_diff_only": 0.41200000047683716, |
| "tpp_threshold_10_unintended_diff_only": 0.11350001394748688, |
| "tpp_threshold_20_total_metric": 0.2500000447034836, |
| "tpp_threshold_20_intended_diff_only": 0.44600003957748413, |
| "tpp_threshold_20_unintended_diff_only": 0.19599999487400055, |
| "tpp_threshold_50_total_metric": 0.12575000524520874, |
| "tpp_threshold_50_intended_diff_only": 0.45100003480911255, |
| "tpp_threshold_50_unintended_diff_only": 0.3252500295639038, |
| "tpp_threshold_100_total_metric": 0.10050003230571747, |
| "tpp_threshold_100_intended_diff_only": 0.45100003480911255, |
| "tpp_threshold_100_unintended_diff_only": 0.3505000025033951, |
| "tpp_threshold_500_total_metric": 0.04075002670288086, |
| "tpp_threshold_500_intended_diff_only": 0.45100003480911255, |
| "tpp_threshold_500_unintended_diff_only": 0.4102500081062317 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": 0.17125000059604645, |
| "tpp_threshold_2_intended_diff_only": 0.23900002241134644, |
| "tpp_threshold_2_unintended_diff_only": 0.06775002181529999, |
| "tpp_threshold_5_total_metric": 0.22824998199939728, |
| "tpp_threshold_5_intended_diff_only": 0.33399999141693115, |
| "tpp_threshold_5_unintended_diff_only": 0.10575000941753387, |
| "tpp_threshold_10_total_metric": 0.2392500340938568, |
| "tpp_threshold_10_intended_diff_only": 0.437000036239624, |
| "tpp_threshold_10_unintended_diff_only": 0.1977500021457672, |
| "tpp_threshold_20_total_metric": 0.23374997079372406, |
| "tpp_threshold_20_intended_diff_only": 0.4819999933242798, |
| "tpp_threshold_20_unintended_diff_only": 0.24825002253055573, |
| "tpp_threshold_50_total_metric": 0.1612500250339508, |
| "tpp_threshold_50_intended_diff_only": 0.4880000352859497, |
| "tpp_threshold_50_unintended_diff_only": 0.3267500102519989, |
| "tpp_threshold_100_total_metric": 0.12975001335144043, |
| "tpp_threshold_100_intended_diff_only": 0.4880000352859497, |
| "tpp_threshold_100_unintended_diff_only": 0.3582500219345093, |
| "tpp_threshold_500_total_metric": 0.09075002372264862, |
| "tpp_threshold_500_intended_diff_only": 0.4880000352859497, |
| "tpp_threshold_500_unintended_diff_only": 0.3972500115633011 |
| }, |
| "9": { |
| "tpp_threshold_2_total_metric": 0.1549999862909317, |
| "tpp_threshold_2_intended_diff_only": 0.26499998569488525, |
| "tpp_threshold_2_unintended_diff_only": 0.10999999940395355, |
| "tpp_threshold_5_total_metric": 0.1864999532699585, |
| "tpp_threshold_5_intended_diff_only": 0.30799996852874756, |
| "tpp_threshold_5_unintended_diff_only": 0.12150001525878906, |
| "tpp_threshold_10_total_metric": 0.18049995601177216, |
| "tpp_threshold_10_intended_diff_only": 0.38599997758865356, |
| "tpp_threshold_10_unintended_diff_only": 0.2055000215768814, |
| "tpp_threshold_20_total_metric": 0.21925000846385956, |
| "tpp_threshold_20_intended_diff_only": 0.4700000286102295, |
| "tpp_threshold_20_unintended_diff_only": 0.25075002014636993, |
| "tpp_threshold_50_total_metric": 0.14125002920627594, |
| "tpp_threshold_50_intended_diff_only": 0.4700000286102295, |
| "tpp_threshold_50_unintended_diff_only": 0.32874999940395355, |
| "tpp_threshold_100_total_metric": 0.09025000035762787, |
| "tpp_threshold_100_intended_diff_only": 0.4700000286102295, |
| "tpp_threshold_100_unintended_diff_only": 0.3797500282526016, |
| "tpp_threshold_500_total_metric": 0.06300000846385956, |
| "tpp_threshold_500_intended_diff_only": 0.4700000286102295, |
| "tpp_threshold_500_unintended_diff_only": 0.40700002014636993 |
| } |
| }, |
| "canrager/amazon_reviews_mcauley_1and5": { |
| "1": { |
| "tpp_threshold_2_total_metric": 0.07124997675418854, |
| "tpp_threshold_2_intended_diff_only": 0.08499997854232788, |
| "tpp_threshold_2_unintended_diff_only": 0.013750001788139343, |
| "tpp_threshold_5_total_metric": 0.1507500410079956, |
| "tpp_threshold_5_intended_diff_only": 0.1730000376701355, |
| "tpp_threshold_5_unintended_diff_only": 0.022249996662139893, |
| "tpp_threshold_10_total_metric": 0.21025002002716064, |
| "tpp_threshold_10_intended_diff_only": 0.23000001907348633, |
| "tpp_threshold_10_unintended_diff_only": 0.019749999046325684, |
| "tpp_threshold_20_total_metric": 0.31974998116493225, |
| "tpp_threshold_20_intended_diff_only": 0.3709999918937683, |
| "tpp_threshold_20_unintended_diff_only": 0.05125001072883606, |
| "tpp_threshold_50_total_metric": 0.30125004053115845, |
| "tpp_threshold_50_intended_diff_only": 0.4270000457763672, |
| "tpp_threshold_50_unintended_diff_only": 0.12575000524520874, |
| "tpp_threshold_100_total_metric": 0.28800003230571747, |
| "tpp_threshold_100_intended_diff_only": 0.4270000457763672, |
| "tpp_threshold_100_unintended_diff_only": 0.13900001347064972, |
| "tpp_threshold_500_total_metric": 0.2107500284910202, |
| "tpp_threshold_500_intended_diff_only": 0.4270000457763672, |
| "tpp_threshold_500_unintended_diff_only": 0.21625001728534698 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.036750033497810364, |
| "tpp_threshold_2_intended_diff_only": 0.0350000262260437, |
| "tpp_threshold_2_unintended_diff_only": -0.0017500072717666626, |
| "tpp_threshold_5_total_metric": 0.058000028133392334, |
| "tpp_threshold_5_intended_diff_only": 0.06800001859664917, |
| "tpp_threshold_5_unintended_diff_only": 0.009999990463256836, |
| "tpp_threshold_10_total_metric": 0.16349999606609344, |
| "tpp_threshold_10_intended_diff_only": 0.18000000715255737, |
| "tpp_threshold_10_unintended_diff_only": 0.016500011086463928, |
| "tpp_threshold_20_total_metric": 0.30300000309944153, |
| "tpp_threshold_20_intended_diff_only": 0.34200000762939453, |
| "tpp_threshold_20_unintended_diff_only": 0.039000004529953, |
| "tpp_threshold_50_total_metric": 0.3450000733137131, |
| "tpp_threshold_50_intended_diff_only": 0.41200006008148193, |
| "tpp_threshold_50_unintended_diff_only": 0.06699998676776886, |
| "tpp_threshold_100_total_metric": 0.31400005519390106, |
| "tpp_threshold_100_intended_diff_only": 0.41200006008148193, |
| "tpp_threshold_100_unintended_diff_only": 0.09800000488758087, |
| "tpp_threshold_500_total_metric": 0.19600005447864532, |
| "tpp_threshold_500_intended_diff_only": 0.41200006008148193, |
| "tpp_threshold_500_unintended_diff_only": 0.2160000056028366 |
| }, |
| "3": { |
| "tpp_threshold_2_total_metric": -0.0007500648498535156, |
| "tpp_threshold_2_intended_diff_only": 0.007999956607818604, |
| "tpp_threshold_2_unintended_diff_only": 0.00875002145767212, |
| "tpp_threshold_5_total_metric": 0.04224996268749237, |
| "tpp_threshold_5_intended_diff_only": 0.04799997806549072, |
| "tpp_threshold_5_unintended_diff_only": 0.005750015377998352, |
| "tpp_threshold_10_total_metric": 0.07749998569488525, |
| "tpp_threshold_10_intended_diff_only": 0.08899998664855957, |
| "tpp_threshold_10_unintended_diff_only": 0.011500000953674316, |
| "tpp_threshold_20_total_metric": 0.20499996840953827, |
| "tpp_threshold_20_intended_diff_only": 0.24199998378753662, |
| "tpp_threshold_20_unintended_diff_only": 0.03700001537799835, |
| "tpp_threshold_50_total_metric": 0.3127499967813492, |
| "tpp_threshold_50_intended_diff_only": 0.3610000014305115, |
| "tpp_threshold_50_unintended_diff_only": 0.04825000464916229, |
| "tpp_threshold_100_total_metric": 0.315249964594841, |
| "tpp_threshold_100_intended_diff_only": 0.38599997758865356, |
| "tpp_threshold_100_unintended_diff_only": 0.07075001299381256, |
| "tpp_threshold_500_total_metric": 0.25300000607967377, |
| "tpp_threshold_500_intended_diff_only": 0.38700002431869507, |
| "tpp_threshold_500_unintended_diff_only": 0.1340000182390213 |
| }, |
| "5": { |
| "tpp_threshold_2_total_metric": 0.025999993085861206, |
| "tpp_threshold_2_intended_diff_only": 0.02799999713897705, |
| "tpp_threshold_2_unintended_diff_only": 0.0020000040531158447, |
| "tpp_threshold_5_total_metric": 0.033000022172927856, |
| "tpp_threshold_5_intended_diff_only": 0.03900003433227539, |
| "tpp_threshold_5_unintended_diff_only": 0.006000012159347534, |
| "tpp_threshold_10_total_metric": 0.06424999237060547, |
| "tpp_threshold_10_intended_diff_only": 0.07499998807907104, |
| "tpp_threshold_10_unintended_diff_only": 0.010749995708465576, |
| "tpp_threshold_20_total_metric": 0.28300005197525024, |
| "tpp_threshold_20_intended_diff_only": 0.3020000457763672, |
| "tpp_threshold_20_unintended_diff_only": 0.018999993801116943, |
| "tpp_threshold_50_total_metric": 0.33500005304813385, |
| "tpp_threshold_50_intended_diff_only": 0.3720000386238098, |
| "tpp_threshold_50_unintended_diff_only": 0.036999985575675964, |
| "tpp_threshold_100_total_metric": 0.2912500351667404, |
| "tpp_threshold_100_intended_diff_only": 0.39000004529953003, |
| "tpp_threshold_100_unintended_diff_only": 0.09875001013278961, |
| "tpp_threshold_500_total_metric": 0.25075003504753113, |
| "tpp_threshold_500_intended_diff_only": 0.39000004529953003, |
| "tpp_threshold_500_unintended_diff_only": 0.1392500102519989 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": -0.0012499690055847168, |
| "tpp_threshold_2_intended_diff_only": 0.003000020980834961, |
| "tpp_threshold_2_unintended_diff_only": 0.004249989986419678, |
| "tpp_threshold_5_total_metric": 0.01675000786781311, |
| "tpp_threshold_5_intended_diff_only": 0.017000019550323486, |
| "tpp_threshold_5_unintended_diff_only": 0.000250011682510376, |
| "tpp_threshold_10_total_metric": 0.09675000607967377, |
| "tpp_threshold_10_intended_diff_only": 0.11400002241134644, |
| "tpp_threshold_10_unintended_diff_only": 0.01725001633167267, |
| "tpp_threshold_20_total_metric": 0.17774999141693115, |
| "tpp_threshold_20_intended_diff_only": 0.19900000095367432, |
| "tpp_threshold_20_unintended_diff_only": 0.021250009536743164, |
| "tpp_threshold_50_total_metric": 0.2870000749826431, |
| "tpp_threshold_50_intended_diff_only": 0.3200000524520874, |
| "tpp_threshold_50_unintended_diff_only": 0.032999977469444275, |
| "tpp_threshold_100_total_metric": 0.26500004529953003, |
| "tpp_threshold_100_intended_diff_only": 0.3250000476837158, |
| "tpp_threshold_100_unintended_diff_only": 0.06000000238418579, |
| "tpp_threshold_500_total_metric": 0.17900003492832184, |
| "tpp_threshold_500_intended_diff_only": 0.3250000476837158, |
| "tpp_threshold_500_unintended_diff_only": 0.14600001275539398 |
| } |
| } |
| } |
| } |