| { |
| "eval_type_id": "tpp", |
| "eval_config": { |
| "random_seed": 42, |
| "dataset_names": [ |
| "LabHC/bias_in_bios_class_set1", |
| "canrager/amazon_reviews_mcauley_1and5" |
| ], |
| "perform_scr": false, |
| "early_stopping_patience": 20, |
| "train_set_size": 4000, |
| "test_set_size": 1000, |
| "context_length": 128, |
| "probe_train_batch_size": 16, |
| "probe_test_batch_size": 500, |
| "probe_epochs": 20, |
| "probe_lr": 0.001, |
| "probe_l1_penalty": 0.001, |
| "sae_batch_size": 125, |
| "llm_batch_size": 32, |
| "llm_dtype": "bfloat16", |
| "lower_vram_usage": false, |
| "model_name": "gemma-2-2b", |
| "n_values": [ |
| 2, |
| 5, |
| 10, |
| 20, |
| 50, |
| 100, |
| 500 |
| ], |
| "column1_vals_lookup": { |
| "LabHC/bias_in_bios_class_set1": [ |
| [ |
| "professor", |
| "nurse" |
| ], |
| [ |
| "architect", |
| "journalist" |
| ], |
| [ |
| "surgeon", |
| "psychologist" |
| ], |
| [ |
| "attorney", |
| "teacher" |
| ] |
| ], |
| "canrager/amazon_reviews_mcauley_1and5": [ |
| [ |
| "Books", |
| "CDs_and_Vinyl" |
| ], |
| [ |
| "Software", |
| "Electronics" |
| ], |
| [ |
| "Pet_Supplies", |
| "Office_Products" |
| ], |
| [ |
| "Industrial_and_Scientific", |
| "Toys_and_Games" |
| ] |
| ] |
| } |
| }, |
| "eval_id": "65b37170-42a3-4f63-bfdc-b4728a912b3a", |
| "datetime_epoch_millis": 1745619623162, |
| "eval_result_metrics": { |
| "tpp_metrics": { |
| "tpp_threshold_2_total_metric": 0.05692499876022339, |
| "tpp_threshold_2_intended_diff_only": 0.06589999794960022, |
| "tpp_threshold_2_unintended_diff_only": 0.008974999189376831, |
| "tpp_threshold_5_total_metric": 0.1515250042080879, |
| "tpp_threshold_5_intended_diff_only": 0.1965000033378601, |
| "tpp_threshold_5_unintended_diff_only": 0.04497499912977219, |
| "tpp_threshold_10_total_metric": 0.18625001311302186, |
| "tpp_threshold_10_intended_diff_only": 0.2580000102519989, |
| "tpp_threshold_10_unintended_diff_only": 0.07174999713897705, |
| "tpp_threshold_20_total_metric": 0.23035001009702682, |
| "tpp_threshold_20_intended_diff_only": 0.33450000882148745, |
| "tpp_threshold_20_unintended_diff_only": 0.10414999872446061, |
| "tpp_threshold_50_total_metric": 0.2436500072479248, |
| "tpp_threshold_50_intended_diff_only": 0.39450001120567324, |
| "tpp_threshold_50_unintended_diff_only": 0.15085000395774842, |
| "tpp_threshold_100_total_metric": 0.25500001311302184, |
| "tpp_threshold_100_intended_diff_only": 0.4299000144004822, |
| "tpp_threshold_100_unintended_diff_only": 0.17490000128746033, |
| "tpp_threshold_500_total_metric": 0.2356000304222107, |
| "tpp_threshold_500_intended_diff_only": 0.4502000391483307, |
| "tpp_threshold_500_unintended_diff_only": 0.21460000872612 |
| } |
| }, |
| "eval_result_details": [ |
| { |
| "dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results", |
| "tpp_threshold_2_total_metric": 0.08565000593662261, |
| "tpp_threshold_2_intended_diff_only": 0.0974000096321106, |
| "tpp_threshold_2_unintended_diff_only": 0.011750003695487976, |
| "tpp_threshold_5_total_metric": 0.2209500104188919, |
| "tpp_threshold_5_intended_diff_only": 0.2974000215530396, |
| "tpp_threshold_5_unintended_diff_only": 0.07645001113414765, |
| "tpp_threshold_10_total_metric": 0.23805002272129058, |
| "tpp_threshold_10_intended_diff_only": 0.36360002756118776, |
| "tpp_threshold_10_unintended_diff_only": 0.12555000483989714, |
| "tpp_threshold_20_total_metric": 0.2441500097513199, |
| "tpp_threshold_20_intended_diff_only": 0.42920001745224, |
| "tpp_threshold_20_unintended_diff_only": 0.18505000770092012, |
| "tpp_threshold_50_total_metric": 0.18515000343322754, |
| "tpp_threshold_50_intended_diff_only": 0.457800018787384, |
| "tpp_threshold_50_unintended_diff_only": 0.2726500153541565, |
| "tpp_threshold_100_total_metric": 0.1491000235080719, |
| "tpp_threshold_100_intended_diff_only": 0.4654000401496887, |
| "tpp_threshold_100_unintended_diff_only": 0.31630001664161683, |
| "tpp_threshold_500_total_metric": 0.08970001637935639, |
| "tpp_threshold_500_intended_diff_only": 0.4676000475883484, |
| "tpp_threshold_500_unintended_diff_only": 0.377900031208992 |
| }, |
| { |
| "dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results", |
| "tpp_threshold_2_total_metric": 0.028199991583824156, |
| "tpp_threshold_2_intended_diff_only": 0.034399986267089844, |
| "tpp_threshold_2_unintended_diff_only": 0.006199994683265686, |
| "tpp_threshold_5_total_metric": 0.08209999799728393, |
| "tpp_threshold_5_intended_diff_only": 0.09559998512268067, |
| "tpp_threshold_5_unintended_diff_only": 0.01349998712539673, |
| "tpp_threshold_10_total_metric": 0.1344500035047531, |
| "tpp_threshold_10_intended_diff_only": 0.15239999294281006, |
| "tpp_threshold_10_unintended_diff_only": 0.017949989438056944, |
| "tpp_threshold_20_total_metric": 0.21655001044273375, |
| "tpp_threshold_20_intended_diff_only": 0.23980000019073486, |
| "tpp_threshold_20_unintended_diff_only": 0.023249989748001097, |
| "tpp_threshold_50_total_metric": 0.30215001106262207, |
| "tpp_threshold_50_intended_diff_only": 0.3312000036239624, |
| "tpp_threshold_50_unintended_diff_only": 0.029049992561340332, |
| "tpp_threshold_100_total_metric": 0.3609000027179718, |
| "tpp_threshold_100_intended_diff_only": 0.3943999886512756, |
| "tpp_threshold_100_unintended_diff_only": 0.03349998593330383, |
| "tpp_threshold_500_total_metric": 0.381500044465065, |
| "tpp_threshold_500_intended_diff_only": 0.43280003070831297, |
| "tpp_threshold_500_unintended_diff_only": 0.05129998624324798 |
| } |
| ], |
| "sae_bench_commit_hash": "Unknown", |
| "sae_lens_id": "blocks.18.hook_resid_post", |
| "sae_lens_release_id": "gemma-2-2b-res-snap-matryoshka-dc", |
| "sae_lens_version": "5.9.1", |
| "sae_cfg_dict": { |
| "architecture": "jumprelu", |
| "d_in": 2304, |
| "d_sae": 32768, |
| "activation_fn_str": "relu", |
| "apply_b_dec_to_input": true, |
| "finetuning_scaling_factor": false, |
| "context_size": 1024, |
| "model_name": "gemma-2-2b", |
| "hook_name": "blocks.18.hook_resid_post", |
| "hook_layer": 18, |
| "hook_head_index": null, |
| "prepend_bos": true, |
| "dataset_path": "chanind/pile-uncopyrighted-gemma-1024-abbrv-1B", |
| "dataset_trust_remote_code": true, |
| "normalize_activations": "none", |
| "dtype": "torch.bfloat16", |
| "device": "cuda", |
| "sae_lens_training_version": "5.5.1", |
| "activation_fn_kwargs": { |
| "k": 40 |
| }, |
| "neuronpedia_id": null, |
| "model_from_pretrained_kwargs": { |
| "center_writing_weights": false |
| }, |
| "seqpos_slice": [ |
| null |
| ] |
| }, |
| "eval_result_unstructured": { |
| "LabHC/bias_in_bios_class_set1": { |
| "0": { |
| "tpp_threshold_2_total_metric": 0.06900003552436829, |
| "tpp_threshold_2_intended_diff_only": 0.07600003480911255, |
| "tpp_threshold_2_unintended_diff_only": 0.006999999284744263, |
| "tpp_threshold_5_total_metric": 0.21800002455711365, |
| "tpp_threshold_5_intended_diff_only": 0.2890000343322754, |
| "tpp_threshold_5_unintended_diff_only": 0.07100000977516174, |
| "tpp_threshold_10_total_metric": 0.22950004041194916, |
| "tpp_threshold_10_intended_diff_only": 0.3760000467300415, |
| "tpp_threshold_10_unintended_diff_only": 0.14650000631809235, |
| "tpp_threshold_20_total_metric": 0.2147500216960907, |
| "tpp_threshold_20_intended_diff_only": 0.4190000295639038, |
| "tpp_threshold_20_unintended_diff_only": 0.2042500078678131, |
| "tpp_threshold_50_total_metric": 0.18800005316734314, |
| "tpp_threshold_50_intended_diff_only": 0.440000057220459, |
| "tpp_threshold_50_unintended_diff_only": 0.25200000405311584, |
| "tpp_threshold_100_total_metric": 0.14325004816055298, |
| "tpp_threshold_100_intended_diff_only": 0.4490000605583191, |
| "tpp_threshold_100_unintended_diff_only": 0.3057500123977661, |
| "tpp_threshold_500_total_metric": 0.08250002562999725, |
| "tpp_threshold_500_intended_diff_only": 0.4540000557899475, |
| "tpp_threshold_500_unintended_diff_only": 0.37150003015995026 |
| }, |
| "1": { |
| "tpp_threshold_2_total_metric": 0.06499996781349182, |
| "tpp_threshold_2_intended_diff_only": 0.08099997043609619, |
| "tpp_threshold_2_unintended_diff_only": 0.01600000262260437, |
| "tpp_threshold_5_total_metric": 0.18800000846385956, |
| "tpp_threshold_5_intended_diff_only": 0.2850000262260437, |
| "tpp_threshold_5_unintended_diff_only": 0.09700001776218414, |
| "tpp_threshold_10_total_metric": 0.22500000894069672, |
| "tpp_threshold_10_intended_diff_only": 0.328000009059906, |
| "tpp_threshold_10_unintended_diff_only": 0.10300000011920929, |
| "tpp_threshold_20_total_metric": 0.1667499989271164, |
| "tpp_threshold_20_intended_diff_only": 0.39800000190734863, |
| "tpp_threshold_20_unintended_diff_only": 0.23125000298023224, |
| "tpp_threshold_50_total_metric": 0.12874998152256012, |
| "tpp_threshold_50_intended_diff_only": 0.453000009059906, |
| "tpp_threshold_50_unintended_diff_only": 0.3242500275373459, |
| "tpp_threshold_100_total_metric": 0.10175001621246338, |
| "tpp_threshold_100_intended_diff_only": 0.4650000333786011, |
| "tpp_threshold_100_unintended_diff_only": 0.3632500171661377, |
| "tpp_threshold_500_total_metric": 0.06350000202655792, |
| "tpp_threshold_500_intended_diff_only": 0.4650000333786011, |
| "tpp_threshold_500_unintended_diff_only": 0.40150003135204315 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.18275001645088196, |
| "tpp_threshold_2_intended_diff_only": 0.19300001859664917, |
| "tpp_threshold_2_unintended_diff_only": 0.010250002145767212, |
| "tpp_threshold_5_total_metric": 0.3080000728368759, |
| "tpp_threshold_5_intended_diff_only": 0.35600006580352783, |
| "tpp_threshold_5_unintended_diff_only": 0.047999992966651917, |
| "tpp_threshold_10_total_metric": 0.2915000468492508, |
| "tpp_threshold_10_intended_diff_only": 0.3960000276565552, |
| "tpp_threshold_10_unintended_diff_only": 0.10449998080730438, |
| "tpp_threshold_20_total_metric": 0.2990000396966934, |
| "tpp_threshold_20_intended_diff_only": 0.4280000329017639, |
| "tpp_threshold_20_unintended_diff_only": 0.1289999932050705, |
| "tpp_threshold_50_total_metric": 0.2057500183582306, |
| "tpp_threshold_50_intended_diff_only": 0.44200003147125244, |
| "tpp_threshold_50_unintended_diff_only": 0.23625001311302185, |
| "tpp_threshold_100_total_metric": 0.1615000218153, |
| "tpp_threshold_100_intended_diff_only": 0.45100003480911255, |
| "tpp_threshold_100_unintended_diff_only": 0.28950001299381256, |
| "tpp_threshold_500_total_metric": 0.06200005114078522, |
| "tpp_threshold_500_intended_diff_only": 0.4530000686645508, |
| "tpp_threshold_500_unintended_diff_only": 0.39100001752376556 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": 0.022250041365623474, |
| "tpp_threshold_2_intended_diff_only": 0.03400003910064697, |
| "tpp_threshold_2_unintended_diff_only": 0.011749997735023499, |
| "tpp_threshold_5_total_metric": 0.15275000035762787, |
| "tpp_threshold_5_intended_diff_only": 0.22100001573562622, |
| "tpp_threshold_5_unintended_diff_only": 0.06825001537799835, |
| "tpp_threshold_10_total_metric": 0.22100003063678741, |
| "tpp_threshold_10_intended_diff_only": 0.32100003957748413, |
| "tpp_threshold_10_unintended_diff_only": 0.10000000894069672, |
| "tpp_threshold_20_total_metric": 0.3200000077486038, |
| "tpp_threshold_20_intended_diff_only": 0.4570000171661377, |
| "tpp_threshold_20_unintended_diff_only": 0.13700000941753387, |
| "tpp_threshold_50_total_metric": 0.22550001740455627, |
| "tpp_threshold_50_intended_diff_only": 0.48000001907348633, |
| "tpp_threshold_50_unintended_diff_only": 0.25450000166893005, |
| "tpp_threshold_100_total_metric": 0.19100002944469452, |
| "tpp_threshold_100_intended_diff_only": 0.487000048160553, |
| "tpp_threshold_100_unintended_diff_only": 0.29600001871585846, |
| "tpp_threshold_500_total_metric": 0.14150002598762512, |
| "tpp_threshold_500_intended_diff_only": 0.49100005626678467, |
| "tpp_threshold_500_unintended_diff_only": 0.34950003027915955 |
| }, |
| "9": { |
| "tpp_threshold_2_total_metric": 0.08924996852874756, |
| "tpp_threshold_2_intended_diff_only": 0.1029999852180481, |
| "tpp_threshold_2_unintended_diff_only": 0.013750016689300537, |
| "tpp_threshold_5_total_metric": 0.23799994587898254, |
| "tpp_threshold_5_intended_diff_only": 0.3359999656677246, |
| "tpp_threshold_5_unintended_diff_only": 0.09800001978874207, |
| "tpp_threshold_10_total_metric": 0.22324998676776886, |
| "tpp_threshold_10_intended_diff_only": 0.3970000147819519, |
| "tpp_threshold_10_unintended_diff_only": 0.17375002801418304, |
| "tpp_threshold_20_total_metric": 0.2202499806880951, |
| "tpp_threshold_20_intended_diff_only": 0.4440000057220459, |
| "tpp_threshold_20_unintended_diff_only": 0.2237500250339508, |
| "tpp_threshold_50_total_metric": 0.17774994671344757, |
| "tpp_threshold_50_intended_diff_only": 0.4739999771118164, |
| "tpp_threshold_50_unintended_diff_only": 0.29625003039836884, |
| "tpp_threshold_100_total_metric": 0.14800000190734863, |
| "tpp_threshold_100_intended_diff_only": 0.4750000238418579, |
| "tpp_threshold_100_unintended_diff_only": 0.3270000219345093, |
| "tpp_threshold_500_total_metric": 0.0989999771118164, |
| "tpp_threshold_500_intended_diff_only": 0.4750000238418579, |
| "tpp_threshold_500_unintended_diff_only": 0.3760000467300415 |
| } |
| }, |
| "canrager/amazon_reviews_mcauley_1and5": { |
| "1": { |
| "tpp_threshold_2_total_metric": 0.013249948620796204, |
| "tpp_threshold_2_intended_diff_only": 0.01799994707107544, |
| "tpp_threshold_2_unintended_diff_only": 0.004749998450279236, |
| "tpp_threshold_5_total_metric": 0.014749959111213684, |
| "tpp_threshold_5_intended_diff_only": 0.026999950408935547, |
| "tpp_threshold_5_unintended_diff_only": 0.012249991297721863, |
| "tpp_threshold_10_total_metric": 0.06425000727176666, |
| "tpp_threshold_10_intended_diff_only": 0.08799999952316284, |
| "tpp_threshold_10_unintended_diff_only": 0.02374999225139618, |
| "tpp_threshold_20_total_metric": 0.21950000524520874, |
| "tpp_threshold_20_intended_diff_only": 0.2549999952316284, |
| "tpp_threshold_20_unintended_diff_only": 0.03549998998641968, |
| "tpp_threshold_50_total_metric": 0.32649996876716614, |
| "tpp_threshold_50_intended_diff_only": 0.36799997091293335, |
| "tpp_threshold_50_unintended_diff_only": 0.04150000214576721, |
| "tpp_threshold_100_total_metric": 0.3934999853372574, |
| "tpp_threshold_100_intended_diff_only": 0.437999963760376, |
| "tpp_threshold_100_unintended_diff_only": 0.04449997842311859, |
| "tpp_threshold_500_total_metric": 0.4032500237226486, |
| "tpp_threshold_500_intended_diff_only": 0.4570000171661377, |
| "tpp_threshold_500_unintended_diff_only": 0.053749993443489075 |
| }, |
| "2": { |
| "tpp_threshold_2_total_metric": 0.015750035643577576, |
| "tpp_threshold_2_intended_diff_only": 0.026000022888183594, |
| "tpp_threshold_2_unintended_diff_only": 0.010249987244606018, |
| "tpp_threshold_5_total_metric": 0.15200001001358032, |
| "tpp_threshold_5_intended_diff_only": 0.1809999942779541, |
| "tpp_threshold_5_unintended_diff_only": 0.02899998426437378, |
| "tpp_threshold_10_total_metric": 0.23850001394748688, |
| "tpp_threshold_10_intended_diff_only": 0.27300000190734863, |
| "tpp_threshold_10_unintended_diff_only": 0.034499987959861755, |
| "tpp_threshold_20_total_metric": 0.32900001108646393, |
| "tpp_threshold_20_intended_diff_only": 0.3659999966621399, |
| "tpp_threshold_20_unintended_diff_only": 0.036999985575675964, |
| "tpp_threshold_50_total_metric": 0.3785000443458557, |
| "tpp_threshold_50_intended_diff_only": 0.4280000329017639, |
| "tpp_threshold_50_unintended_diff_only": 0.0494999885559082, |
| "tpp_threshold_100_total_metric": 0.38700003921985626, |
| "tpp_threshold_100_intended_diff_only": 0.44300001859664917, |
| "tpp_threshold_100_unintended_diff_only": 0.05599997937679291, |
| "tpp_threshold_500_total_metric": 0.36750006675720215, |
| "tpp_threshold_500_intended_diff_only": 0.4450000524520874, |
| "tpp_threshold_500_unintended_diff_only": 0.07749998569488525 |
| }, |
| "3": { |
| "tpp_threshold_2_total_metric": 0.00449998676776886, |
| "tpp_threshold_2_intended_diff_only": 0.004999995231628418, |
| "tpp_threshold_2_unintended_diff_only": 0.0005000084638595581, |
| "tpp_threshold_5_total_metric": -0.004000023007392883, |
| "tpp_threshold_5_intended_diff_only": 0.001999974250793457, |
| "tpp_threshold_5_unintended_diff_only": 0.00599999725818634, |
| "tpp_threshold_10_total_metric": 0.006749957799911499, |
| "tpp_threshold_10_intended_diff_only": 0.012999951839447021, |
| "tpp_threshold_10_unintended_diff_only": 0.0062499940395355225, |
| "tpp_threshold_20_total_metric": 0.028749987483024597, |
| "tpp_threshold_20_intended_diff_only": 0.03700000047683716, |
| "tpp_threshold_20_unintended_diff_only": 0.008250012993812561, |
| "tpp_threshold_50_total_metric": 0.10825000703334808, |
| "tpp_threshold_50_intended_diff_only": 0.12099999189376831, |
| "tpp_threshold_50_unintended_diff_only": 0.012749984860420227, |
| "tpp_threshold_100_total_metric": 0.26399996876716614, |
| "tpp_threshold_100_intended_diff_only": 0.2759999632835388, |
| "tpp_threshold_100_unintended_diff_only": 0.01199999451637268, |
| "tpp_threshold_500_total_metric": 0.3967500329017639, |
| "tpp_threshold_500_intended_diff_only": 0.42900002002716064, |
| "tpp_threshold_500_unintended_diff_only": 0.03224998712539673 |
| }, |
| "5": { |
| "tpp_threshold_2_total_metric": 0.04224999248981476, |
| "tpp_threshold_2_intended_diff_only": 0.04799997806549072, |
| "tpp_threshold_2_unintended_diff_only": 0.005749985575675964, |
| "tpp_threshold_5_total_metric": 0.10975003242492676, |
| "tpp_threshold_5_intended_diff_only": 0.12000000476837158, |
| "tpp_threshold_5_unintended_diff_only": 0.010249972343444824, |
| "tpp_threshold_10_total_metric": 0.16300004720687866, |
| "tpp_threshold_10_intended_diff_only": 0.17400002479553223, |
| "tpp_threshold_10_unintended_diff_only": 0.010999977588653564, |
| "tpp_threshold_20_total_metric": 0.22175002098083496, |
| "tpp_threshold_20_intended_diff_only": 0.24000000953674316, |
| "tpp_threshold_20_unintended_diff_only": 0.018249988555908203, |
| "tpp_threshold_50_total_metric": 0.3395000398159027, |
| "tpp_threshold_50_intended_diff_only": 0.3630000352859497, |
| "tpp_threshold_50_unintended_diff_only": 0.023499995470046997, |
| "tpp_threshold_100_total_metric": 0.3947499990463257, |
| "tpp_threshold_100_intended_diff_only": 0.421999990940094, |
| "tpp_threshold_100_unintended_diff_only": 0.02724999189376831, |
| "tpp_threshold_500_total_metric": 0.3850000500679016, |
| "tpp_threshold_500_intended_diff_only": 0.437000036239624, |
| "tpp_threshold_500_unintended_diff_only": 0.05199998617172241 |
| }, |
| "6": { |
| "tpp_threshold_2_total_metric": 0.06524999439716339, |
| "tpp_threshold_2_intended_diff_only": 0.07499998807907104, |
| "tpp_threshold_2_unintended_diff_only": 0.009749993681907654, |
| "tpp_threshold_5_total_metric": 0.1380000114440918, |
| "tpp_threshold_5_intended_diff_only": 0.14800000190734863, |
| "tpp_threshold_5_unintended_diff_only": 0.009999990463256836, |
| "tpp_threshold_10_total_metric": 0.19974999129772186, |
| "tpp_threshold_10_intended_diff_only": 0.21399998664855957, |
| "tpp_threshold_10_unintended_diff_only": 0.014249995350837708, |
| "tpp_threshold_20_total_metric": 0.2837500274181366, |
| "tpp_threshold_20_intended_diff_only": 0.3009999990463257, |
| "tpp_threshold_20_unintended_diff_only": 0.017249971628189087, |
| "tpp_threshold_50_total_metric": 0.3579999953508377, |
| "tpp_threshold_50_intended_diff_only": 0.37599998712539673, |
| "tpp_threshold_50_unintended_diff_only": 0.01799999177455902, |
| "tpp_threshold_100_total_metric": 0.36525002121925354, |
| "tpp_threshold_100_intended_diff_only": 0.3930000066757202, |
| "tpp_threshold_100_unintended_diff_only": 0.027749985456466675, |
| "tpp_threshold_500_total_metric": 0.3550000488758087, |
| "tpp_threshold_500_intended_diff_only": 0.3960000276565552, |
| "tpp_threshold_500_unintended_diff_only": 0.04099997878074646 |
| } |
| } |
| } |
| } |