| { |
| "eval_type_id": "tpp", |
| "eval_config": { |
| "random_seed": 42, |
| "dataset_names": [ |
| "LabHC/bias_in_bios_class_set1", |
| "canrager/amazon_reviews_mcauley_1and5" |
| ], |
| "perform_scr": false, |
| "early_stopping_patience": 20, |
| "train_set_size": 4000, |
| "test_set_size": 1000, |
| "context_length": 128, |
| "probe_train_batch_size": 16, |
| "probe_test_batch_size": 500, |
| "probe_epochs": 20, |
| "probe_lr": 0.001, |
| "probe_l1_penalty": 0.001, |
| "sae_batch_size": 125, |
| "llm_batch_size": 32, |
| "llm_dtype": "bfloat16", |
| "lower_vram_usage": false, |
| "model_name": "gemma-2-2b", |
| "n_values": [ |
| 2, |
| 5, |
| 10, |
| 20, |
| 50, |
| 100, |
| 500 |
| ], |
| "column1_vals_lookup": { |
| "LabHC/bias_in_bios_class_set1": [ |
| [ |
| "professor", |
| "nurse" |
| ], |
| [ |
| "architect", |
| "journalist" |
| ], |
| [ |
| "surgeon", |
| "psychologist" |
| ], |
| [ |
| "attorney", |
| "teacher" |
| ] |
| ], |
| "canrager/amazon_reviews_mcauley_1and5": [ |
| [ |
| "Books", |
| "CDs_and_Vinyl" |
| ], |
| [ |
| "Software", |
| "Electronics" |
| ], |
| [ |
| "Pet_Supplies", |
| "Office_Products" |
| ], |
| [ |
| "Industrial_and_Scientific", |
| "Toys_and_Games" |
| ] |
| ] |
| } |
| }, |
| "eval_id": "65b37170-42a3-4f63-bfdc-b4728a912b3a", |
| "datetime_epoch_millis": 1745617399138, |
| "eval_result_metrics": { |
| "tpp_metrics": { |
| "tpp_threshold_2_total_metric": 0.08745000213384628, |
| "tpp_threshold_2_intended_diff_only": 0.14260000586509705, |
| "tpp_threshold_2_unintended_diff_only": 0.055150003731250764, |
| "tpp_threshold_5_total_metric": 0.11192499995231628, |
| "tpp_threshold_5_intended_diff_only": 0.19540000557899473, |
| "tpp_threshold_5_unintended_diff_only": 0.08347500562667846, |
| "tpp_threshold_10_total_metric": 0.1195999950170517, |
| "tpp_threshold_10_intended_diff_only": 0.23760000467300416, |
| "tpp_threshold_10_unintended_diff_only": 0.11800000965595245, |
| "tpp_threshold_20_total_metric": 0.14760000407695772, |
| "tpp_threshold_20_intended_diff_only": 0.29660001397132874, |
| "tpp_threshold_20_unintended_diff_only": 0.14900000989437104, |
| "tpp_threshold_50_total_metric": 0.1983000248670578, |
| "tpp_threshold_50_intended_diff_only": 0.3852000296115875, |
| "tpp_threshold_50_unintended_diff_only": 0.18690000474452972, |
| "tpp_threshold_100_total_metric": 0.21795002818107606, |
| "tpp_threshold_100_intended_diff_only": 0.4261000335216522, |
| "tpp_threshold_100_unintended_diff_only": 0.20815000534057618, |
| "tpp_threshold_500_total_metric": 0.18197503536939622, |
| "tpp_threshold_500_intended_diff_only": 0.43440004587173464, |
| "tpp_threshold_500_unintended_diff_only": 0.2524250105023384 |
| } |
| }, |
| "eval_result_details": [ |
| { |
| "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", |
| "tpp_threshold_2_total_metric": 0.15685000121593476, |
| "tpp_threshold_2_intended_diff_only": 0.2628000020980835, |
| "tpp_threshold_2_unintended_diff_only": 0.10595000088214875, |
| "tpp_threshold_5_total_metric": 0.17820000648498535, |
| "tpp_threshold_5_intended_diff_only": 0.3382000088691711, |
| "tpp_threshold_5_unintended_diff_only": 0.1600000023841858, |
| "tpp_threshold_10_total_metric": 0.15974999964237213, |
| "tpp_threshold_10_intended_diff_only": 0.38260000944137573, |
| "tpp_threshold_10_unintended_diff_only": 0.2228500097990036, |
| "tpp_threshold_20_total_metric": 0.14375000894069673, |
| "tpp_threshold_20_intended_diff_only": 0.4234000205993652, |
| "tpp_threshold_20_unintended_diff_only": 0.27965001165866854, |
| "tpp_threshold_50_total_metric": 0.10650001466274261, |
| "tpp_threshold_50_intended_diff_only": 0.45120002031326295, |
| "tpp_threshold_50_unintended_diff_only": 0.34470000565052034, |
| "tpp_threshold_100_total_metric": 0.08845002353191375, |
| "tpp_threshold_100_intended_diff_only": 0.4640000343322754, |
| "tpp_threshold_100_unintended_diff_only": 0.37555001080036166, |
| "tpp_threshold_500_total_metric": 0.05925002694129944, |
| "tpp_threshold_500_intended_diff_only": 0.46620004177093505, |
| "tpp_threshold_500_unintended_diff_only": 0.4069500148296356 |
| }, |
| { |
| "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", |
| "tpp_threshold_2_total_metric": 0.018050003051757812, |
| "tpp_threshold_2_intended_diff_only": 0.022400009632110595, |
| "tpp_threshold_2_unintended_diff_only": 0.004350006580352783, |
| "tpp_threshold_5_total_metric": 0.04564999341964722, |
| "tpp_threshold_5_intended_diff_only": 0.05260000228881836, |
| "tpp_threshold_5_unintended_diff_only": 0.006950008869171143, |
| "tpp_threshold_10_total_metric": 0.07944999039173126, |
| "tpp_threshold_10_intended_diff_only": 0.09259999990463257, |
| "tpp_threshold_10_unintended_diff_only": 0.013150009512901305, |
| "tpp_threshold_20_total_metric": 0.1514499992132187, |
| "tpp_threshold_20_intended_diff_only": 0.16980000734329223, |
| "tpp_threshold_20_unintended_diff_only": 0.018350008130073547, |
| "tpp_threshold_50_total_metric": 0.290100035071373, |
| "tpp_threshold_50_intended_diff_only": 0.3192000389099121, |
| "tpp_threshold_50_unintended_diff_only": 0.029100003838539123, |
| "tpp_threshold_100_total_metric": 0.34745003283023834, |
| "tpp_threshold_100_intended_diff_only": 0.38820003271102904, |
| "tpp_threshold_100_unintended_diff_only": 0.04074999988079071, |
| "tpp_threshold_500_total_metric": 0.304700043797493, |
| "tpp_threshold_500_intended_diff_only": 0.4026000499725342, |
| "tpp_threshold_500_unintended_diff_only": 0.0979000061750412 |
| } |
| ], |
| "sae_bench_commit_hash": "Unknown", |
| "sae_lens_id": "blocks.5.hook_resid_post", |
| "sae_lens_release_id": "gemma-2-2b-res-snap-matryoshka-dc", |
| "sae_lens_version": "5.9.1", |
| "sae_cfg_dict": { |
| "architecture": "jumprelu", |
| "d_in": 2304, |
| "d_sae": 32768, |
| "activation_fn_str": "relu", |
| "apply_b_dec_to_input": true, |
| "finetuning_scaling_factor": false, |
| "context_size": 1024, |
| "model_name": "gemma-2-2b", |
| "hook_name": "blocks.5.hook_resid_post", |
| "hook_layer": 5, |
| "hook_head_index": null, |
| "prepend_bos": true, |
| "dataset_path": "chanind/pile-uncopyrighted-gemma-1024-abbrv-1B", |
| "dataset_trust_remote_code": true, |
| "normalize_activations": "none", |
| "dtype": "torch.bfloat16", |
| "device": "cuda", |
| "sae_lens_training_version": "5.5.1", |
| "activation_fn_kwargs": { |
| "k": 40 |
| }, |
| "neuronpedia_id": null, |
| "model_from_pretrained_kwargs": { |
| "center_writing_weights": false |
| }, |
| "seqpos_slice": [ |
| null |
| ] |
| }, |
| "eval_result_unstructured": { |
| "LabHC/bias_in_bios_class_set1": { |
| "0": { |
| "tpp_threshold_2_total_metric": 0.2097499966621399, |
| "tpp_threshold_2_intended_diff_only": 0.3100000023841858, |
| "tpp_threshold_2_unintended_diff_only": 0.1002500057220459, |
| "tpp_threshold_5_total_metric": 0.2097500115633011, |
| "tpp_threshold_5_intended_diff_only": 0.36000001430511475, |
| "tpp_threshold_5_unintended_diff_only": 0.15025000274181366, |
| "tpp_threshold_10_total_metric": 0.13224998116493225, |
| "tpp_threshold_10_intended_diff_only": 0.4039999842643738, |
| "tpp_threshold_10_unintended_diff_only": 0.27175000309944153, |
| "tpp_threshold_20_total_metric": 0.09399999678134918, |
| "tpp_threshold_20_intended_diff_only": 0.42100000381469727, |
| "tpp_threshold_20_unintended_diff_only": 0.3270000070333481, |
| "tpp_threshold_50_total_metric": 0.07200002670288086, |
| "tpp_threshold_50_intended_diff_only": 0.437000036239624, |
| "tpp_threshold_50_unintended_diff_only": 0.36500000953674316, |
| "tpp_threshold_100_total_metric": 0.06349998712539673, |
| "tpp_threshold_100_intended_diff_only": 0.4490000009536743, |
| "tpp_threshold_100_unintended_diff_only": 0.3855000138282776, |
| "tpp_threshold_500_total_metric": 0.04050002992153168, |
| "tpp_threshold_500_intended_diff_only": 0.45100003480911255, |
| "tpp_threshold_500_unintended_diff_only": 0.41050000488758087 |
| }, |
| "1": { |
| "tpp_threshold_2_total_metric": 0.12874995172023773, |
| "tpp_threshold_2_intended_diff_only": 0.2799999713897705, |
| "tpp_threshold_2_unintended_diff_only": 0.15125001966953278, |
| "tpp_threshold_5_total_metric": 0.14374996721744537, |
| "tpp_threshold_5_intended_diff_only": 0.32499998807907104, |
| "tpp_threshold_5_unintended_diff_only": 0.18125002086162567, |
| "tpp_threshold_10_total_metric": 0.12749998271465302, |
| "tpp_threshold_10_intended_diff_only": 0.3610000014305115, |
| "tpp_threshold_10_unintended_diff_only": 0.23350001871585846, |
| "tpp_threshold_20_total_metric": 0.14549997448921204, |
| "tpp_threshold_20_intended_diff_only": 0.4359999895095825, |
| "tpp_threshold_20_unintended_diff_only": 0.2905000150203705, |
| "tpp_threshold_50_total_metric": 0.109499990940094, |
| "tpp_threshold_50_intended_diff_only": 0.4580000042915344, |
| "tpp_threshold_50_unintended_diff_only": 0.34850001335144043, |
| "tpp_threshold_100_total_metric": 0.07125000655651093, |
| "tpp_threshold_100_intended_diff_only": 0.4610000252723694, |
| "tpp_threshold_100_unintended_diff_only": 0.38975001871585846, |
| "tpp_threshold_500_total_metric": 0.04625000059604645, |
| "tpp_threshold_500_intended_diff_only": 0.4610000252723694, |
| "tpp_threshold_500_unintended_diff_only": 0.41475002467632294 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.098750039935112, |
| "tpp_threshold_2_intended_diff_only": 0.12400001287460327, |
| "tpp_threshold_2_unintended_diff_only": 0.025249972939491272, |
| "tpp_threshold_5_total_metric": 0.1875000298023224, |
| "tpp_threshold_5_intended_diff_only": 0.2670000195503235, |
| "tpp_threshold_5_unintended_diff_only": 0.0794999897480011, |
| "tpp_threshold_10_total_metric": 0.19975003600120544, |
| "tpp_threshold_10_intended_diff_only": 0.3070000410079956, |
| "tpp_threshold_10_unintended_diff_only": 0.10725000500679016, |
| "tpp_threshold_20_total_metric": 0.13975003361701965, |
| "tpp_threshold_20_intended_diff_only": 0.359000027179718, |
| "tpp_threshold_20_unintended_diff_only": 0.21924999356269836, |
| "tpp_threshold_50_total_metric": 0.1052500307559967, |
| "tpp_threshold_50_intended_diff_only": 0.4010000228881836, |
| "tpp_threshold_50_unintended_diff_only": 0.2957499921321869, |
| "tpp_threshold_100_total_metric": 0.0907500684261322, |
| "tpp_threshold_100_intended_diff_only": 0.4440000653266907, |
| "tpp_threshold_100_unintended_diff_only": 0.35324999690055847, |
| "tpp_threshold_500_total_metric": 0.037750065326690674, |
| "tpp_threshold_500_intended_diff_only": 0.4530000686645508, |
| "tpp_threshold_500_unintended_diff_only": 0.4152500033378601 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": 0.20900003612041473, |
| "tpp_threshold_2_intended_diff_only": 0.31700003147125244, |
| "tpp_threshold_2_unintended_diff_only": 0.10799999535083771, |
| "tpp_threshold_5_total_metric": 0.20850004255771637, |
| "tpp_threshold_5_intended_diff_only": 0.39100003242492676, |
| "tpp_threshold_5_unintended_diff_only": 0.1824999898672104, |
| "tpp_threshold_10_total_metric": 0.21850000321865082, |
| "tpp_threshold_10_intended_diff_only": 0.4580000042915344, |
| "tpp_threshold_10_unintended_diff_only": 0.2395000010728836, |
| "tpp_threshold_20_total_metric": 0.1940000355243683, |
| "tpp_threshold_20_intended_diff_only": 0.47700005769729614, |
| "tpp_threshold_20_unintended_diff_only": 0.28300002217292786, |
| "tpp_threshold_50_total_metric": 0.1250000149011612, |
| "tpp_threshold_50_intended_diff_only": 0.48500001430511475, |
| "tpp_threshold_50_unintended_diff_only": 0.35999999940395355, |
| "tpp_threshold_100_total_metric": 0.12175005674362183, |
| "tpp_threshold_100_intended_diff_only": 0.49100005626678467, |
| "tpp_threshold_100_unintended_diff_only": 0.36924999952316284, |
| "tpp_threshold_500_total_metric": 0.09475003182888031, |
| "tpp_threshold_500_intended_diff_only": 0.49100005626678467, |
| "tpp_threshold_500_unintended_diff_only": 0.39625002443790436 |
| }, |
| "9": { |
| "tpp_threshold_2_total_metric": 0.1379999816417694, |
| "tpp_threshold_2_intended_diff_only": 0.28299999237060547, |
| "tpp_threshold_2_unintended_diff_only": 0.14500001072883606, |
| "tpp_threshold_5_total_metric": 0.14149998128414154, |
| "tpp_threshold_5_intended_diff_only": 0.3479999899864197, |
| "tpp_threshold_5_unintended_diff_only": 0.20650000870227814, |
| "tpp_threshold_10_total_metric": 0.12074999511241913, |
| "tpp_threshold_10_intended_diff_only": 0.3830000162124634, |
| "tpp_threshold_10_unintended_diff_only": 0.26225002110004425, |
| "tpp_threshold_20_total_metric": 0.14550000429153442, |
| "tpp_threshold_20_intended_diff_only": 0.4240000247955322, |
| "tpp_threshold_20_unintended_diff_only": 0.2785000205039978, |
| "tpp_threshold_50_total_metric": 0.12075001001358032, |
| "tpp_threshold_50_intended_diff_only": 0.4750000238418579, |
| "tpp_threshold_50_unintended_diff_only": 0.3542500138282776, |
| "tpp_threshold_100_total_metric": 0.0949999988079071, |
| "tpp_threshold_100_intended_diff_only": 0.4750000238418579, |
| "tpp_threshold_100_unintended_diff_only": 0.3800000250339508, |
| "tpp_threshold_500_total_metric": 0.07700000703334808, |
| "tpp_threshold_500_intended_diff_only": 0.4750000238418579, |
| "tpp_threshold_500_unintended_diff_only": 0.3980000168085098 |
| } |
| }, |
| "canrager/amazon_reviews_mcauley_1and5": { |
| "1": { |
| "tpp_threshold_2_total_metric": 0.011499986052513123, |
| "tpp_threshold_2_intended_diff_only": 0.013999998569488525, |
| "tpp_threshold_2_unintended_diff_only": 0.002500012516975403, |
| "tpp_threshold_5_total_metric": 0.05824996531009674, |
| "tpp_threshold_5_intended_diff_only": 0.06699997186660767, |
| "tpp_threshold_5_unintended_diff_only": 0.008750006556510925, |
| "tpp_threshold_10_total_metric": 0.09949997067451477, |
| "tpp_threshold_10_intended_diff_only": 0.11699998378753662, |
| "tpp_threshold_10_unintended_diff_only": 0.01750001311302185, |
| "tpp_threshold_20_total_metric": 0.16199998557567596, |
| "tpp_threshold_20_intended_diff_only": 0.1850000023841858, |
| "tpp_threshold_20_unintended_diff_only": 0.023000016808509827, |
| "tpp_threshold_50_total_metric": 0.3462500125169754, |
| "tpp_threshold_50_intended_diff_only": 0.36900001764297485, |
| "tpp_threshold_50_unintended_diff_only": 0.02275000512599945, |
| "tpp_threshold_100_total_metric": 0.3890000283718109, |
| "tpp_threshold_100_intended_diff_only": 0.4230000376701355, |
| "tpp_threshold_100_unintended_diff_only": 0.034000009298324585, |
| "tpp_threshold_500_total_metric": 0.35325002670288086, |
| "tpp_threshold_500_intended_diff_only": 0.437000036239624, |
| "tpp_threshold_500_unintended_diff_only": 0.08375000953674316 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.024750009179115295, |
| "tpp_threshold_2_intended_diff_only": 0.03200000524520874, |
| "tpp_threshold_2_unintended_diff_only": 0.007249996066093445, |
| "tpp_threshold_5_total_metric": 0.054499998688697815, |
| "tpp_threshold_5_intended_diff_only": 0.06499999761581421, |
| "tpp_threshold_5_unintended_diff_only": 0.010499998927116394, |
| "tpp_threshold_10_total_metric": 0.07225000858306885, |
| "tpp_threshold_10_intended_diff_only": 0.0820000171661377, |
| "tpp_threshold_10_unintended_diff_only": 0.009750008583068848, |
| "tpp_threshold_20_total_metric": 0.16825000941753387, |
| "tpp_threshold_20_intended_diff_only": 0.18400001525878906, |
| "tpp_threshold_20_unintended_diff_only": 0.015750005841255188, |
| "tpp_threshold_50_total_metric": 0.3190000653266907, |
| "tpp_threshold_50_intended_diff_only": 0.3470000624656677, |
| "tpp_threshold_50_unintended_diff_only": 0.02799999713897705, |
| "tpp_threshold_100_total_metric": 0.3695000559091568, |
| "tpp_threshold_100_intended_diff_only": 0.4180000424385071, |
| "tpp_threshold_100_unintended_diff_only": 0.04849998652935028, |
| "tpp_threshold_500_total_metric": 0.315000057220459, |
| "tpp_threshold_500_intended_diff_only": 0.42100006341934204, |
| "tpp_threshold_500_unintended_diff_only": 0.10600000619888306 |
| }, |
| "3": { |
| "tpp_threshold_2_total_metric": 0.01225002110004425, |
| "tpp_threshold_2_intended_diff_only": 0.00700002908706665, |
| "tpp_threshold_2_unintended_diff_only": -0.0052499920129776, |
| "tpp_threshold_5_total_metric": 0.01150001585483551, |
| "tpp_threshold_5_intended_diff_only": 0.016000032424926758, |
| "tpp_threshold_5_unintended_diff_only": 0.0045000165700912476, |
| "tpp_threshold_10_total_metric": 0.062000006437301636, |
| "tpp_threshold_10_intended_diff_only": 0.078000009059906, |
| "tpp_threshold_10_unintended_diff_only": 0.01600000262260437, |
| "tpp_threshold_20_total_metric": 0.13950000703334808, |
| "tpp_threshold_20_intended_diff_only": 0.16600000858306885, |
| "tpp_threshold_20_unintended_diff_only": 0.026500001549720764, |
| "tpp_threshold_50_total_metric": 0.26325003802776337, |
| "tpp_threshold_50_intended_diff_only": 0.3070000410079956, |
| "tpp_threshold_50_unintended_diff_only": 0.04375000298023224, |
| "tpp_threshold_100_total_metric": 0.31575001776218414, |
| "tpp_threshold_100_intended_diff_only": 0.37300002574920654, |
| "tpp_threshold_100_unintended_diff_only": 0.0572500079870224, |
| "tpp_threshold_500_total_metric": 0.28175005316734314, |
| "tpp_threshold_500_intended_diff_only": 0.3980000615119934, |
| "tpp_threshold_500_unintended_diff_only": 0.11625000834465027 |
| }, |
| "5": { |
| "tpp_threshold_2_total_metric": 0.005250036716461182, |
| "tpp_threshold_2_intended_diff_only": 0.016000032424926758, |
| "tpp_threshold_2_unintended_diff_only": 0.010749995708465576, |
| "tpp_threshold_5_total_metric": 0.038750022649765015, |
| "tpp_threshold_5_intended_diff_only": 0.04400002956390381, |
| "tpp_threshold_5_unintended_diff_only": 0.005250006914138794, |
| "tpp_threshold_10_total_metric": 0.07725000381469727, |
| "tpp_threshold_10_intended_diff_only": 0.08700001239776611, |
| "tpp_threshold_10_unintended_diff_only": 0.009750008583068848, |
| "tpp_threshold_20_total_metric": 0.12125000357627869, |
| "tpp_threshold_20_intended_diff_only": 0.13300001621246338, |
| "tpp_threshold_20_unintended_diff_only": 0.011750012636184692, |
| "tpp_threshold_50_total_metric": 0.25850003957748413, |
| "tpp_threshold_50_intended_diff_only": 0.27900004386901855, |
| "tpp_threshold_50_unintended_diff_only": 0.020500004291534424, |
| "tpp_threshold_100_total_metric": 0.3602500259876251, |
| "tpp_threshold_100_intended_diff_only": 0.37800002098083496, |
| "tpp_threshold_100_unintended_diff_only": 0.01774999499320984, |
| "tpp_threshold_500_total_metric": 0.3042500615119934, |
| "tpp_threshold_500_intended_diff_only": 0.4030000567436218, |
| "tpp_threshold_500_unintended_diff_only": 0.09874999523162842 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": 0.03649996221065521, |
| "tpp_threshold_2_intended_diff_only": 0.042999982833862305, |
| "tpp_threshold_2_unintended_diff_only": 0.006500020623207092, |
| "tpp_threshold_5_total_metric": 0.065249964594841, |
| "tpp_threshold_5_intended_diff_only": 0.07099997997283936, |
| "tpp_threshold_5_unintended_diff_only": 0.005750015377998352, |
| "tpp_threshold_10_total_metric": 0.08624996244907379, |
| "tpp_threshold_10_intended_diff_only": 0.0989999771118164, |
| "tpp_threshold_10_unintended_diff_only": 0.012750014662742615, |
| "tpp_threshold_20_total_metric": 0.16624999046325684, |
| "tpp_threshold_20_intended_diff_only": 0.1809999942779541, |
| "tpp_threshold_20_unintended_diff_only": 0.014750003814697266, |
| "tpp_threshold_50_total_metric": 0.26350001990795135, |
| "tpp_threshold_50_intended_diff_only": 0.2940000295639038, |
| "tpp_threshold_50_unintended_diff_only": 0.030500009655952454, |
| "tpp_threshold_100_total_metric": 0.30275003612041473, |
| "tpp_threshold_100_intended_diff_only": 0.3490000367164612, |
| "tpp_threshold_100_unintended_diff_only": 0.04625000059604645, |
| "tpp_threshold_500_total_metric": 0.2692500203847885, |
| "tpp_threshold_500_intended_diff_only": 0.3540000319480896, |
| "tpp_threshold_500_unintended_diff_only": 0.08475001156330109 |
| } |
| } |
| } |
| } |