Upload 12 files

a795f8c verified 3 days ago

11.3 kB

Tasks	Version	Filter	n-shot	Metric		Value		Stderr
arc_easy	1	none	0	acc	↑	0.5223	±	0.0102
		none	0	acc_norm	↑	0.4600	±	0.0102
blimp	2	none		acc	↑	0.7631	±	0.0014
- blimp_adjunct_island	1	none	0	acc	↑	0.8420	±	0.0115
- blimp_anaphor_gender_agreement	1	none	0	acc	↑	0.8430	±	0.0115
- blimp_anaphor_number_agreement	1	none	0	acc	↑	0.9620	±	0.0060
- blimp_animate_subject_passive	1	none	0	acc	↑	0.7820	±	0.0131
- blimp_animate_subject_trans	1	none	0	acc	↑	0.8040	±	0.0126
- blimp_causative	1	none	0	acc	↑	0.6980	±	0.0145
- blimp_complex_NP_island	1	none	0	acc	↑	0.4940	±	0.0158
- blimp_coordinate_structure_constraint_complex_left_branch	1	none	0	acc	↑	0.7420	±	0.0138
- blimp_coordinate_structure_constraint_object_extraction	1	none	0	acc	↑	0.7520	±	0.0137
- blimp_determiner_noun_agreement_1	1	none	0	acc	↑	0.9790	±	0.0045
- blimp_determiner_noun_agreement_2	1	none	0	acc	↑	0.9680	±	0.0056
- blimp_determiner_noun_agreement_irregular_1	1	none	0	acc	↑	0.8990	±	0.0095
- blimp_determiner_noun_agreement_irregular_2	1	none	0	acc	↑	0.9650	±	0.0058
- blimp_determiner_noun_agreement_with_adj_2	1	none	0	acc	↑	0.9340	±	0.0079
- blimp_determiner_noun_agreement_with_adj_irregular_1	1	none	0	acc	↑	0.8740	±	0.0105
- blimp_determiner_noun_agreement_with_adj_irregular_2	1	none	0	acc	↑	0.9270	±	0.0082
- blimp_determiner_noun_agreement_with_adjective_1	1	none	0	acc	↑	0.9410	±	0.0075
- blimp_distractor_agreement_relational_noun	1	none	0	acc	↑	0.8780	±	0.0104
- blimp_distractor_agreement_relative_clause	1	none	0	acc	↑	0.7210	±	0.0142
- blimp_drop_argument	1	none	0	acc	↑	0.7500	±	0.0137
- blimp_ellipsis_n_bar_1	1	none	0	acc	↑	0.8060	±	0.0125
- blimp_ellipsis_n_bar_2	1	none	0	acc	↑	0.8820	±	0.0102
- blimp_existential_there_object_raising	1	none	0	acc	↑	0.8750	±	0.0105
- blimp_existential_there_quantifiers_1	1	none	0	acc	↑	0.9730	±	0.0051
- blimp_existential_there_quantifiers_2	1	none	0	acc	↑	0.2070	±	0.0128
- blimp_existential_there_subject_raising	1	none	0	acc	↑	0.8810	±	0.0102
- blimp_expletive_it_object_raising	1	none	0	acc	↑	0.7830	±	0.0130
- blimp_inchoative	1	none	0	acc	↑	0.6330	±	0.0152
- blimp_intransitive	1	none	0	acc	↑	0.7310	±	0.0140
- blimp_irregular_past_participle_adjectives	1	none	0	acc	↑	0.8620	±	0.0109
- blimp_irregular_past_participle_verbs	1	none	0	acc	↑	0.8930	±	0.0098
- blimp_irregular_plural_subject_verb_agreement_1	1	none	0	acc	↑	0.8990	±	0.0095
- blimp_irregular_plural_subject_verb_agreement_2	1	none	0	acc	↑	0.9030	±	0.0094
- blimp_left_branch_island_echo_question	1	none	0	acc	↑	0.3810	±	0.0154
- blimp_left_branch_island_simple_question	1	none	0	acc	↑	0.6470	±	0.0151
- blimp_matrix_question_npi_licensor_present	1	none	0	acc	↑	0.1260	±	0.0105
- blimp_npi_present_1	1	none	0	acc	↑	0.5710	±	0.0157
- blimp_npi_present_2	1	none	0	acc	↑	0.6190	±	0.0154
- blimp_only_npi_licensor_present	1	none	0	acc	↑	0.6250	±	0.0153
- blimp_only_npi_scope	1	none	0	acc	↑	0.5360	±	0.0158
- blimp_passive_1	1	none	0	acc	↑	0.8770	±	0.0104
- blimp_passive_2	1	none	0	acc	↑	0.8840	±	0.0101
- blimp_principle_A_c_command	1	none	0	acc	↑	0.5560	±	0.0157
- blimp_principle_A_case_1	1	none	0	acc	↑	1.0000	±	0
- blimp_principle_A_case_2	1	none	0	acc	↑	0.9650	±	0.0058
- blimp_principle_A_domain_1	1	none	0	acc	↑	0.9430	±	0.0073
- blimp_principle_A_domain_2	1	none	0	acc	↑	0.8040	±	0.0126
- blimp_principle_A_domain_3	1	none	0	acc	↑	0.5200	±	0.0158
- blimp_principle_A_reconstruction	1	none	0	acc	↑	0.2920	±	0.0144
- blimp_regular_plural_subject_verb_agreement_1	1	none	0	acc	↑	0.8930	±	0.0098
- blimp_regular_plural_subject_verb_agreement_2	1	none	0	acc	↑	0.9110	±	0.0090
- blimp_sentential_negation_npi_licensor_present	1	none	0	acc	↑	0.9930	±	0.0026
- blimp_sentential_negation_npi_scope	1	none	0	acc	↑	0.7100	±	0.0144
- blimp_sentential_subject_island	1	none	0	acc	↑	0.3310	±	0.0149
- blimp_superlative_quantifiers_1	1	none	0	acc	↑	0.7800	±	0.0131
- blimp_superlative_quantifiers_2	1	none	0	acc	↑	0.7450	±	0.0138
- blimp_tough_vs_raising_1	1	none	0	acc	↑	0.5390	±	0.0158
- blimp_tough_vs_raising_2	1	none	0	acc	↑	0.8780	±	0.0104
- blimp_transitive	1	none	0	acc	↑	0.8430	±	0.0115
- blimp_wh_island	1	none	0	acc	↑	0.7190	±	0.0142
- blimp_wh_questions_object_gap	1	none	0	acc	↑	0.7590	±	0.0135
- blimp_wh_questions_subject_gap	1	none	0	acc	↑	0.9280	±	0.0082
- blimp_wh_questions_subject_gap_long_distance	1	none	0	acc	↑	0.8550	±	0.0111
- blimp_wh_vs_that_no_gap	1	none	0	acc	↑	0.9490	±	0.0070
- blimp_wh_vs_that_no_gap_long_distance	1	none	0	acc	↑	0.9490	±	0.0070
- blimp_wh_vs_that_with_gap	1	none	0	acc	↑	0.5920	±	0.0155
- blimp_wh_vs_that_with_gap_long_distance	1	none	0	acc	↑	0.3280	±	0.0149
hellaswag	1	none	0	acc	↑	0.2914	±	0.0045
		none	0	acc_norm	↑	0.3178	±	0.0046
lambada_openai	1	none	0	acc	↑	0.2591	±	0.0061
		none	0	perplexity	↓	95.5121	±	4.1325
lambada_standard	1	none	0	acc	↑	0.1716	±	0.0053
		none	0	perplexity	↓	488.2170	±	23.4634
piqa	1	none	0	acc	↑	0.6224	±	0.0113
		none	0	acc_norm	↑	0.6208	±	0.0113
sciq	1	none	0	acc	↑	0.7720	±	0.0133
		none	0	acc_norm	↑	0.6810	±	0.0147
wikitext	2	none	0	bits_per_byte	↓	1.0267	±	N/A
		none	0	byte_perplexity	↓	2.0374	±	N/A
		none	0	word_perplexity	↓	44.9548	±	N/A
winogrande	1	none	0	acc	↑	0.5099	±	0.0140

Groups	Version	Filter	n-shot	Metric		Value		Stderr
blimp	2	none		acc	↑	0.7631	±	0.0014