| { |
| "eval_type_id": "tpp", |
| "eval_config": { |
| "random_seed": 42, |
| "dataset_names": [ |
| "LabHC/bias_in_bios_class_set1", |
| "canrager/amazon_reviews_mcauley_1and5" |
| ], |
| "perform_scr": false, |
| "early_stopping_patience": 20, |
| "train_set_size": 4000, |
| "test_set_size": 1000, |
| "context_length": 128, |
| "probe_train_batch_size": 16, |
| "probe_test_batch_size": 500, |
| "probe_epochs": 20, |
| "probe_lr": 0.001, |
| "probe_l1_penalty": 0.001, |
| "sae_batch_size": 125, |
| "llm_batch_size": 32, |
| "llm_dtype": "bfloat16", |
| "lower_vram_usage": false, |
| "model_name": "gemma-2-2b", |
| "n_values": [ |
| 2, |
| 5, |
| 10, |
| 20, |
| 50, |
| 100, |
| 500 |
| ], |
| "column1_vals_lookup": { |
| "LabHC/bias_in_bios_class_set1": [ |
| [ |
| "professor", |
| "nurse" |
| ], |
| [ |
| "architect", |
| "journalist" |
| ], |
| [ |
| "surgeon", |
| "psychologist" |
| ], |
| [ |
| "attorney", |
| "teacher" |
| ] |
| ], |
| "canrager/amazon_reviews_mcauley_1and5": [ |
| [ |
| "Books", |
| "CDs_and_Vinyl" |
| ], |
| [ |
| "Software", |
| "Electronics" |
| ], |
| [ |
| "Pet_Supplies", |
| "Office_Products" |
| ], |
| [ |
| "Industrial_and_Scientific", |
| "Toys_and_Games" |
| ] |
| ] |
| } |
| }, |
| "eval_id": "65b37170-42a3-4f63-bfdc-b4728a912b3a", |
| "datetime_epoch_millis": 1745616913793, |
| "eval_result_metrics": { |
| "tpp_metrics": { |
| "tpp_threshold_2_total_metric": 0.03630000203847885, |
| "tpp_threshold_2_intended_diff_only": 0.0662000060081482, |
| "tpp_threshold_2_unintended_diff_only": 0.029900003969669343, |
| "tpp_threshold_5_total_metric": 0.08457500338554381, |
| "tpp_threshold_5_intended_diff_only": 0.15630000829696655, |
| "tpp_threshold_5_unintended_diff_only": 0.07172500491142272, |
| "tpp_threshold_10_total_metric": 0.11277500689029694, |
| "tpp_threshold_10_intended_diff_only": 0.21140000820159913, |
| "tpp_threshold_10_unintended_diff_only": 0.09862500131130218, |
| "tpp_threshold_20_total_metric": 0.1531750127673149, |
| "tpp_threshold_20_intended_diff_only": 0.2742000162601471, |
| "tpp_threshold_20_unintended_diff_only": 0.12102500349283218, |
| "tpp_threshold_50_total_metric": 0.21600002348423003, |
| "tpp_threshold_50_intended_diff_only": 0.38650002479553225, |
| "tpp_threshold_50_unintended_diff_only": 0.1705000013113022, |
| "tpp_threshold_100_total_metric": 0.2225750133395195, |
| "tpp_threshold_100_intended_diff_only": 0.4223000228404999, |
| "tpp_threshold_100_unintended_diff_only": 0.19972500950098038, |
| "tpp_threshold_500_total_metric": 0.1843750312924385, |
| "tpp_threshold_500_intended_diff_only": 0.43320004343986507, |
| "tpp_threshold_500_unintended_diff_only": 0.2488250121474266 |
| } |
| }, |
| "eval_result_details": [ |
| { |
| "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", |
| "tpp_threshold_2_total_metric": 0.06900000274181366, |
| "tpp_threshold_2_intended_diff_only": 0.12720000743865967, |
| "tpp_threshold_2_unintended_diff_only": 0.05820000469684601, |
| "tpp_threshold_5_total_metric": 0.15985000431537627, |
| "tpp_threshold_5_intended_diff_only": 0.29940000772476194, |
| "tpp_threshold_5_unintended_diff_only": 0.13955000340938567, |
| "tpp_threshold_10_total_metric": 0.19015000462532045, |
| "tpp_threshold_10_intended_diff_only": 0.37820000648498536, |
| "tpp_threshold_10_unintended_diff_only": 0.18805000185966492, |
| "tpp_threshold_20_total_metric": 0.20135002434253693, |
| "tpp_threshold_20_intended_diff_only": 0.42460002899169924, |
| "tpp_threshold_20_unintended_diff_only": 0.22325000464916228, |
| "tpp_threshold_50_total_metric": 0.1459500253200531, |
| "tpp_threshold_50_intended_diff_only": 0.457800030708313, |
| "tpp_threshold_50_unintended_diff_only": 0.3118500053882599, |
| "tpp_threshold_100_total_metric": 0.10880002081394195, |
| "tpp_threshold_100_intended_diff_only": 0.4624000310897827, |
| "tpp_threshold_100_unintended_diff_only": 0.35360001027584076, |
| "tpp_threshold_500_total_metric": 0.06760002970695496, |
| "tpp_threshold_500_intended_diff_only": 0.462600040435791, |
| "tpp_threshold_500_unintended_diff_only": 0.39500001072883606 |
| }, |
| { |
| "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", |
| "tpp_threshold_2_total_metric": 0.003600001335144043, |
| "tpp_threshold_2_intended_diff_only": 0.005200004577636719, |
| "tpp_threshold_2_unintended_diff_only": 0.0016000032424926757, |
| "tpp_threshold_5_total_metric": 0.009300002455711364, |
| "tpp_threshold_5_intended_diff_only": 0.013200008869171142, |
| "tpp_threshold_5_unintended_diff_only": 0.003900006413459778, |
| "tpp_threshold_10_total_metric": 0.035400009155273436, |
| "tpp_threshold_10_intended_diff_only": 0.04460000991821289, |
| "tpp_threshold_10_unintended_diff_only": 0.009200000762939453, |
| "tpp_threshold_20_total_metric": 0.10500000119209289, |
| "tpp_threshold_20_intended_diff_only": 0.12380000352859497, |
| "tpp_threshold_20_unintended_diff_only": 0.018800002336502076, |
| "tpp_threshold_50_total_metric": 0.286050021648407, |
| "tpp_threshold_50_intended_diff_only": 0.31520001888275145, |
| "tpp_threshold_50_unintended_diff_only": 0.02914999723434448, |
| "tpp_threshold_100_total_metric": 0.33635000586509706, |
| "tpp_threshold_100_intended_diff_only": 0.38220001459121705, |
| "tpp_threshold_100_unintended_diff_only": 0.045850008726119995, |
| "tpp_threshold_500_total_metric": 0.3011500328779221, |
| "tpp_threshold_500_intended_diff_only": 0.4038000464439392, |
| "tpp_threshold_500_unintended_diff_only": 0.10265001356601715 |
| } |
| ], |
| "sae_bench_commit_hash": "Unknown", |
| "sae_lens_id": "blocks.2.hook_resid_post", |
| "sae_lens_release_id": "gemma-2-2b-res-snap-matryoshka-dc", |
| "sae_lens_version": "5.9.1", |
| "sae_cfg_dict": { |
| "architecture": "jumprelu", |
| "d_in": 2304, |
| "d_sae": 32768, |
| "activation_fn_str": "relu", |
| "apply_b_dec_to_input": true, |
| "finetuning_scaling_factor": false, |
| "context_size": 1024, |
| "model_name": "gemma-2-2b", |
| "hook_name": "blocks.2.hook_resid_post", |
| "hook_layer": 2, |
| "hook_head_index": null, |
| "prepend_bos": true, |
| "dataset_path": "chanind/pile-uncopyrighted-gemma-1024-abbrv-1B", |
| "dataset_trust_remote_code": true, |
| "normalize_activations": "none", |
| "dtype": "torch.bfloat16", |
| "device": "cuda", |
| "sae_lens_training_version": "5.5.1", |
| "activation_fn_kwargs": { |
| "k": 40 |
| }, |
| "neuronpedia_id": null, |
| "model_from_pretrained_kwargs": { |
| "center_writing_weights": false |
| }, |
| "seqpos_slice": [ |
| null |
| ] |
| }, |
| "eval_result_unstructured": { |
| "LabHC/bias_in_bios_class_set1": { |
| "0": { |
| "tpp_threshold_2_total_metric": 0.009499996900558472, |
| "tpp_threshold_2_intended_diff_only": 0.013999998569488525, |
| "tpp_threshold_2_unintended_diff_only": 0.004500001668930054, |
| "tpp_threshold_5_total_metric": 0.19075000286102295, |
| "tpp_threshold_5_intended_diff_only": 0.36500000953674316, |
| "tpp_threshold_5_unintended_diff_only": 0.17425000667572021, |
| "tpp_threshold_10_total_metric": 0.1575000137090683, |
| "tpp_threshold_10_intended_diff_only": 0.38700002431869507, |
| "tpp_threshold_10_unintended_diff_only": 0.22950001060962677, |
| "tpp_threshold_20_total_metric": 0.15925003588199615, |
| "tpp_threshold_20_intended_diff_only": 0.4180000424385071, |
| "tpp_threshold_20_unintended_diff_only": 0.2587500065565109, |
| "tpp_threshold_50_total_metric": 0.11475001275539398, |
| "tpp_threshold_50_intended_diff_only": 0.43400001525878906, |
| "tpp_threshold_50_unintended_diff_only": 0.3192500025033951, |
| "tpp_threshold_100_total_metric": 0.09700000286102295, |
| "tpp_threshold_100_intended_diff_only": 0.4440000057220459, |
| "tpp_threshold_100_unintended_diff_only": 0.34700000286102295, |
| "tpp_threshold_500_total_metric": 0.04875004291534424, |
| "tpp_threshold_500_intended_diff_only": 0.4450000524520874, |
| "tpp_threshold_500_unintended_diff_only": 0.39625000953674316 |
| }, |
| "1": { |
| "tpp_threshold_2_total_metric": 0.15399998426437378, |
| "tpp_threshold_2_intended_diff_only": 0.3009999990463257, |
| "tpp_threshold_2_unintended_diff_only": 0.1470000147819519, |
| "tpp_threshold_5_total_metric": 0.17549997568130493, |
| "tpp_threshold_5_intended_diff_only": 0.32999998331069946, |
| "tpp_threshold_5_unintended_diff_only": 0.15450000762939453, |
| "tpp_threshold_10_total_metric": 0.16699998080730438, |
| "tpp_threshold_10_intended_diff_only": 0.3579999804496765, |
| "tpp_threshold_10_unintended_diff_only": 0.19099999964237213, |
| "tpp_threshold_20_total_metric": 0.1872500330209732, |
| "tpp_threshold_20_intended_diff_only": 0.3960000276565552, |
| "tpp_threshold_20_unintended_diff_only": 0.20874999463558197, |
| "tpp_threshold_50_total_metric": 0.14925000071525574, |
| "tpp_threshold_50_intended_diff_only": 0.453000009059906, |
| "tpp_threshold_50_unintended_diff_only": 0.30375000834465027, |
| "tpp_threshold_100_total_metric": 0.09850001335144043, |
| "tpp_threshold_100_intended_diff_only": 0.4610000252723694, |
| "tpp_threshold_100_unintended_diff_only": 0.36250001192092896, |
| "tpp_threshold_500_total_metric": 0.06600001454353333, |
| "tpp_threshold_500_intended_diff_only": 0.4610000252723694, |
| "tpp_threshold_500_unintended_diff_only": 0.39500001072883606 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.010250046849250793, |
| "tpp_threshold_2_intended_diff_only": 0.016000032424926758, |
| "tpp_threshold_2_unintended_diff_only": 0.005749985575675964, |
| "tpp_threshold_5_total_metric": 0.12700001895427704, |
| "tpp_threshold_5_intended_diff_only": 0.2580000162124634, |
| "tpp_threshold_5_unintended_diff_only": 0.13099999725818634, |
| "tpp_threshold_10_total_metric": 0.13250000774860382, |
| "tpp_threshold_10_intended_diff_only": 0.3199999928474426, |
| "tpp_threshold_10_unintended_diff_only": 0.1874999850988388, |
| "tpp_threshold_20_total_metric": 0.14525003731250763, |
| "tpp_threshold_20_intended_diff_only": 0.38600003719329834, |
| "tpp_threshold_20_unintended_diff_only": 0.2407499998807907, |
| "tpp_threshold_50_total_metric": 0.13050003349781036, |
| "tpp_threshold_50_intended_diff_only": 0.4450000524520874, |
| "tpp_threshold_50_unintended_diff_only": 0.31450001895427704, |
| "tpp_threshold_100_total_metric": 0.08175003528594971, |
| "tpp_threshold_100_intended_diff_only": 0.4500000476837158, |
| "tpp_threshold_100_unintended_diff_only": 0.3682500123977661, |
| "tpp_threshold_500_total_metric": 0.038500040769577026, |
| "tpp_threshold_500_intended_diff_only": 0.4500000476837158, |
| "tpp_threshold_500_unintended_diff_only": 0.4115000069141388 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": 0.025499999523162842, |
| "tpp_threshold_2_intended_diff_only": 0.09700000286102295, |
| "tpp_threshold_2_unintended_diff_only": 0.07150000333786011, |
| "tpp_threshold_5_total_metric": 0.14249998331069946, |
| "tpp_threshold_5_intended_diff_only": 0.28299999237060547, |
| "tpp_threshold_5_unintended_diff_only": 0.140500009059906, |
| "tpp_threshold_10_total_metric": 0.20299997925758362, |
| "tpp_threshold_10_intended_diff_only": 0.421999990940094, |
| "tpp_threshold_10_unintended_diff_only": 0.21900001168251038, |
| "tpp_threshold_20_total_metric": 0.1887500137090683, |
| "tpp_threshold_20_intended_diff_only": 0.4650000333786011, |
| "tpp_threshold_20_unintended_diff_only": 0.2762500196695328, |
| "tpp_threshold_50_total_metric": 0.1702500283718109, |
| "tpp_threshold_50_intended_diff_only": 0.4880000352859497, |
| "tpp_threshold_50_unintended_diff_only": 0.3177500069141388, |
| "tpp_threshold_100_total_metric": 0.1380000114440918, |
| "tpp_threshold_100_intended_diff_only": 0.4880000352859497, |
| "tpp_threshold_100_unintended_diff_only": 0.3500000238418579, |
| "tpp_threshold_500_total_metric": 0.10975001752376556, |
| "tpp_threshold_500_intended_diff_only": 0.4880000352859497, |
| "tpp_threshold_500_unintended_diff_only": 0.37825001776218414 |
| }, |
| "9": { |
| "tpp_threshold_2_total_metric": 0.1457499861717224, |
| "tpp_threshold_2_intended_diff_only": 0.20800000429153442, |
| "tpp_threshold_2_unintended_diff_only": 0.06225001811981201, |
| "tpp_threshold_5_total_metric": 0.16350004076957703, |
| "tpp_threshold_5_intended_diff_only": 0.26100003719329834, |
| "tpp_threshold_5_unintended_diff_only": 0.09749999642372131, |
| "tpp_threshold_10_total_metric": 0.29075004160404205, |
| "tpp_threshold_10_intended_diff_only": 0.40400004386901855, |
| "tpp_threshold_10_unintended_diff_only": 0.1132500022649765, |
| "tpp_threshold_20_total_metric": 0.32625000178813934, |
| "tpp_threshold_20_intended_diff_only": 0.4580000042915344, |
| "tpp_threshold_20_unintended_diff_only": 0.13175000250339508, |
| "tpp_threshold_50_total_metric": 0.1650000512599945, |
| "tpp_threshold_50_intended_diff_only": 0.46900004148483276, |
| "tpp_threshold_50_unintended_diff_only": 0.30399999022483826, |
| "tpp_threshold_100_total_metric": 0.1287500411272049, |
| "tpp_threshold_100_intended_diff_only": 0.46900004148483276, |
| "tpp_threshold_100_unintended_diff_only": 0.34025000035762787, |
| "tpp_threshold_500_total_metric": 0.07500003278255463, |
| "tpp_threshold_500_intended_diff_only": 0.46900004148483276, |
| "tpp_threshold_500_unintended_diff_only": 0.39400000870227814 |
| } |
| }, |
| "canrager/amazon_reviews_mcauley_1and5": { |
| "1": { |
| "tpp_threshold_2_total_metric": -0.0029999613761901855, |
| "tpp_threshold_2_intended_diff_only": -0.001999974250793457, |
| "tpp_threshold_2_unintended_diff_only": 0.0009999871253967285, |
| "tpp_threshold_5_total_metric": 0.0030000507831573486, |
| "tpp_threshold_5_intended_diff_only": 0.001000046730041504, |
| "tpp_threshold_5_unintended_diff_only": -0.0020000040531158447, |
| "tpp_threshold_10_total_metric": 0.07500004768371582, |
| "tpp_threshold_10_intended_diff_only": 0.09400004148483276, |
| "tpp_threshold_10_unintended_diff_only": 0.018999993801116943, |
| "tpp_threshold_20_total_metric": 0.18800005316734314, |
| "tpp_threshold_20_intended_diff_only": 0.21800005435943604, |
| "tpp_threshold_20_unintended_diff_only": 0.030000001192092896, |
| "tpp_threshold_50_total_metric": 0.3445000499486923, |
| "tpp_threshold_50_intended_diff_only": 0.38600003719329834, |
| "tpp_threshold_50_unintended_diff_only": 0.04149998724460602, |
| "tpp_threshold_100_total_metric": 0.3765000253915787, |
| "tpp_threshold_100_intended_diff_only": 0.4240000247955322, |
| "tpp_threshold_100_unintended_diff_only": 0.04749999940395355, |
| "tpp_threshold_500_total_metric": 0.3422500640153885, |
| "tpp_threshold_500_intended_diff_only": 0.43000006675720215, |
| "tpp_threshold_500_unintended_diff_only": 0.08775000274181366 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.004500001668930054, |
| "tpp_threshold_2_intended_diff_only": 0.009000003337860107, |
| "tpp_threshold_2_unintended_diff_only": 0.004500001668930054, |
| "tpp_threshold_5_total_metric": 0.022250011563301086, |
| "tpp_threshold_5_intended_diff_only": 0.03100001811981201, |
| "tpp_threshold_5_unintended_diff_only": 0.008750006556510925, |
| "tpp_threshold_10_total_metric": 0.045750051736831665, |
| "tpp_threshold_10_intended_diff_only": 0.05200004577636719, |
| "tpp_threshold_10_unintended_diff_only": 0.0062499940395355225, |
| "tpp_threshold_20_total_metric": 0.09675002098083496, |
| "tpp_threshold_20_intended_diff_only": 0.11900001764297485, |
| "tpp_threshold_20_unintended_diff_only": 0.022249996662139893, |
| "tpp_threshold_50_total_metric": 0.2290000319480896, |
| "tpp_threshold_50_intended_diff_only": 0.26200002431869507, |
| "tpp_threshold_50_unintended_diff_only": 0.03299999237060547, |
| "tpp_threshold_100_total_metric": 0.30150000751018524, |
| "tpp_threshold_100_intended_diff_only": 0.35500001907348633, |
| "tpp_threshold_100_unintended_diff_only": 0.053500011563301086, |
| "tpp_threshold_500_total_metric": 0.2872500568628311, |
| "tpp_threshold_500_intended_diff_only": 0.42100006341934204, |
| "tpp_threshold_500_unintended_diff_only": 0.13375000655651093 |
| }, |
| "3": { |
| "tpp_threshold_2_total_metric": 0.0004999786615371704, |
| "tpp_threshold_2_intended_diff_only": -0.0040000081062316895, |
| "tpp_threshold_2_unintended_diff_only": -0.00449998676776886, |
| "tpp_threshold_5_total_metric": -0.007500052452087402, |
| "tpp_threshold_5_intended_diff_only": -0.0020000338554382324, |
| "tpp_threshold_5_unintended_diff_only": 0.00550001859664917, |
| "tpp_threshold_10_total_metric": -0.0017500072717666626, |
| "tpp_threshold_10_intended_diff_only": 0.004999995231628418, |
| "tpp_threshold_10_unintended_diff_only": 0.006750002503395081, |
| "tpp_threshold_20_total_metric": 0.051749974489212036, |
| "tpp_threshold_20_intended_diff_only": 0.05799996852874756, |
| "tpp_threshold_20_unintended_diff_only": 0.0062499940395355225, |
| "tpp_threshold_50_total_metric": 0.36625002324581146, |
| "tpp_threshold_50_intended_diff_only": 0.37400001287460327, |
| "tpp_threshold_50_unintended_diff_only": 0.007749989628791809, |
| "tpp_threshold_100_total_metric": 0.37599998712539673, |
| "tpp_threshold_100_intended_diff_only": 0.39800000190734863, |
| "tpp_threshold_100_unintended_diff_only": 0.022000014781951904, |
| "tpp_threshold_500_total_metric": 0.325500026345253, |
| "tpp_threshold_500_intended_diff_only": 0.40000003576278687, |
| "tpp_threshold_500_unintended_diff_only": 0.07450000941753387 |
| }, |
| "5": { |
| "tpp_threshold_2_total_metric": 0.01725001633167267, |
| "tpp_threshold_2_intended_diff_only": 0.021000027656555176, |
| "tpp_threshold_2_unintended_diff_only": 0.0037500113248825073, |
| "tpp_threshold_5_total_metric": 0.011000007390975952, |
| "tpp_threshold_5_intended_diff_only": 0.017000019550323486, |
| "tpp_threshold_5_unintended_diff_only": 0.006000012159347534, |
| "tpp_threshold_10_total_metric": 0.0157499760389328, |
| "tpp_threshold_10_intended_diff_only": 0.019999980926513672, |
| "tpp_threshold_10_unintended_diff_only": 0.004250004887580872, |
| "tpp_threshold_20_total_metric": 0.07399998605251312, |
| "tpp_threshold_20_intended_diff_only": 0.09799998998641968, |
| "tpp_threshold_20_unintended_diff_only": 0.024000003933906555, |
| "tpp_threshold_50_total_metric": 0.21400003135204315, |
| "tpp_threshold_50_intended_diff_only": 0.2560000419616699, |
| "tpp_threshold_50_unintended_diff_only": 0.04200001060962677, |
| "tpp_threshold_100_total_metric": 0.31300000846385956, |
| "tpp_threshold_100_intended_diff_only": 0.37800002098083496, |
| "tpp_threshold_100_unintended_diff_only": 0.0650000125169754, |
| "tpp_threshold_500_total_metric": 0.2850000262260437, |
| "tpp_threshold_500_intended_diff_only": 0.40800005197525024, |
| "tpp_threshold_500_unintended_diff_only": 0.12300002574920654 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": -0.0012500286102294922, |
| "tpp_threshold_2_intended_diff_only": 0.001999974250793457, |
| "tpp_threshold_2_unintended_diff_only": 0.0032500028610229492, |
| "tpp_threshold_5_total_metric": 0.01774999499320984, |
| "tpp_threshold_5_intended_diff_only": 0.018999993801116943, |
| "tpp_threshold_5_unintended_diff_only": 0.0012499988079071045, |
| "tpp_threshold_10_total_metric": 0.042249977588653564, |
| "tpp_threshold_10_intended_diff_only": 0.05199998617172241, |
| "tpp_threshold_10_unintended_diff_only": 0.009750008583068848, |
| "tpp_threshold_20_total_metric": 0.11449997127056122, |
| "tpp_threshold_20_intended_diff_only": 0.12599998712539673, |
| "tpp_threshold_20_unintended_diff_only": 0.01150001585483551, |
| "tpp_threshold_50_total_metric": 0.2764999717473984, |
| "tpp_threshold_50_intended_diff_only": 0.2979999780654907, |
| "tpp_threshold_50_unintended_diff_only": 0.021500006318092346, |
| "tpp_threshold_100_total_metric": 0.314750000834465, |
| "tpp_threshold_100_intended_diff_only": 0.35600000619888306, |
| "tpp_threshold_100_unintended_diff_only": 0.04125000536441803, |
| "tpp_threshold_500_total_metric": 0.265749990940094, |
| "tpp_threshold_500_intended_diff_only": 0.36000001430511475, |
| "tpp_threshold_500_unintended_diff_only": 0.09425002336502075 |
| } |
| } |
| } |
| } |