Supra-50M-Base / benchmarks.md
LH-Tech-AI's picture
Upload 12 files
a795f8c verified
|Tasks|Version|Filter|n-shot|Metric|Value| |Stderr|
|---|---:|---|---:|---|---:|---|---:|
|arc_easy|1|none|0|acc|0.5223|±|0.0102|
| | |none|0|acc_norm|0.4600|±|0.0102|
|blimp|2|none| |acc|0.7631|±|0.0014|
| - blimp_adjunct_island|1|none|0|acc|0.8420|±|0.0115|
| - blimp_anaphor_gender_agreement|1|none|0|acc|0.8430|±|0.0115|
| - blimp_anaphor_number_agreement|1|none|0|acc|0.9620|±|0.0060|
| - blimp_animate_subject_passive|1|none|0|acc|0.7820|±|0.0131|
| - blimp_animate_subject_trans|1|none|0|acc|0.8040|±|0.0126|
| - blimp_causative|1|none|0|acc|0.6980|±|0.0145|
| - blimp_complex_NP_island|1|none|0|acc|0.4940|±|0.0158|
| - blimp_coordinate_structure_constraint_complex_left_branch|1|none|0|acc|0.7420|±|0.0138|
| - blimp_coordinate_structure_constraint_object_extraction|1|none|0|acc|0.7520|±|0.0137|
| - blimp_determiner_noun_agreement_1|1|none|0|acc|0.9790|±|0.0045|
| - blimp_determiner_noun_agreement_2|1|none|0|acc|0.9680|±|0.0056|
| - blimp_determiner_noun_agreement_irregular_1|1|none|0|acc|0.8990|±|0.0095|
| - blimp_determiner_noun_agreement_irregular_2|1|none|0|acc|0.9650|±|0.0058|
| - blimp_determiner_noun_agreement_with_adj_2|1|none|0|acc|0.9340|±|0.0079|
| - blimp_determiner_noun_agreement_with_adj_irregular_1|1|none|0|acc|0.8740|±|0.0105|
| - blimp_determiner_noun_agreement_with_adj_irregular_2|1|none|0|acc|0.9270|±|0.0082|
| - blimp_determiner_noun_agreement_with_adjective_1|1|none|0|acc|0.9410|±|0.0075|
| - blimp_distractor_agreement_relational_noun|1|none|0|acc|0.8780|±|0.0104|
| - blimp_distractor_agreement_relative_clause|1|none|0|acc|0.7210|±|0.0142|
| - blimp_drop_argument|1|none|0|acc|0.7500|±|0.0137|
| - blimp_ellipsis_n_bar_1|1|none|0|acc|0.8060|±|0.0125|
| - blimp_ellipsis_n_bar_2|1|none|0|acc|0.8820|±|0.0102|
| - blimp_existential_there_object_raising|1|none|0|acc|0.8750|±|0.0105|
| - blimp_existential_there_quantifiers_1|1|none|0|acc|0.9730|±|0.0051|
| - blimp_existential_there_quantifiers_2|1|none|0|acc|0.2070|±|0.0128|
| - blimp_existential_there_subject_raising|1|none|0|acc|0.8810|±|0.0102|
| - blimp_expletive_it_object_raising|1|none|0|acc|0.7830|±|0.0130|
| - blimp_inchoative|1|none|0|acc|0.6330|±|0.0152|
| - blimp_intransitive|1|none|0|acc|0.7310|±|0.0140|
| - blimp_irregular_past_participle_adjectives|1|none|0|acc|0.8620|±|0.0109|
| - blimp_irregular_past_participle_verbs|1|none|0|acc|0.8930|±|0.0098|
| - blimp_irregular_plural_subject_verb_agreement_1|1|none|0|acc|0.8990|±|0.0095|
| - blimp_irregular_plural_subject_verb_agreement_2|1|none|0|acc|0.9030|±|0.0094|
| - blimp_left_branch_island_echo_question|1|none|0|acc|0.3810|±|0.0154|
| - blimp_left_branch_island_simple_question|1|none|0|acc|0.6470|±|0.0151|
| - blimp_matrix_question_npi_licensor_present|1|none|0|acc|0.1260|±|0.0105|
| - blimp_npi_present_1|1|none|0|acc|0.5710|±|0.0157|
| - blimp_npi_present_2|1|none|0|acc|0.6190|±|0.0154|
| - blimp_only_npi_licensor_present|1|none|0|acc|0.6250|±|0.0153|
| - blimp_only_npi_scope|1|none|0|acc|0.5360|±|0.0158|
| - blimp_passive_1|1|none|0|acc|0.8770|±|0.0104|
| - blimp_passive_2|1|none|0|acc|0.8840|±|0.0101|
| - blimp_principle_A_c_command|1|none|0|acc|0.5560|±|0.0157|
| - blimp_principle_A_case_1|1|none|0|acc|1.0000|±|0|
| - blimp_principle_A_case_2|1|none|0|acc|0.9650|±|0.0058|
| - blimp_principle_A_domain_1|1|none|0|acc|0.9430|±|0.0073|
| - blimp_principle_A_domain_2|1|none|0|acc|0.8040|±|0.0126|
| - blimp_principle_A_domain_3|1|none|0|acc|0.5200|±|0.0158|
| - blimp_principle_A_reconstruction|1|none|0|acc|0.2920|±|0.0144|
| - blimp_regular_plural_subject_verb_agreement_1|1|none|0|acc|0.8930|±|0.0098|
| - blimp_regular_plural_subject_verb_agreement_2|1|none|0|acc|0.9110|±|0.0090|
| - blimp_sentential_negation_npi_licensor_present|1|none|0|acc|0.9930|±|0.0026|
| - blimp_sentential_negation_npi_scope|1|none|0|acc|0.7100|±|0.0144|
| - blimp_sentential_subject_island|1|none|0|acc|0.3310|±|0.0149|
| - blimp_superlative_quantifiers_1|1|none|0|acc|0.7800|±|0.0131|
| - blimp_superlative_quantifiers_2|1|none|0|acc|0.7450|±|0.0138|
| - blimp_tough_vs_raising_1|1|none|0|acc|0.5390|±|0.0158|
| - blimp_tough_vs_raising_2|1|none|0|acc|0.8780|±|0.0104|
| - blimp_transitive|1|none|0|acc|0.8430|±|0.0115|
| - blimp_wh_island|1|none|0|acc|0.7190|±|0.0142|
| - blimp_wh_questions_object_gap|1|none|0|acc|0.7590|±|0.0135|
| - blimp_wh_questions_subject_gap|1|none|0|acc|0.9280|±|0.0082|
| - blimp_wh_questions_subject_gap_long_distance|1|none|0|acc|0.8550|±|0.0111|
| - blimp_wh_vs_that_no_gap|1|none|0|acc|0.9490|±|0.0070|
| - blimp_wh_vs_that_no_gap_long_distance|1|none|0|acc|0.9490|±|0.0070|
| - blimp_wh_vs_that_with_gap|1|none|0|acc|0.5920|±|0.0155|
| - blimp_wh_vs_that_with_gap_long_distance|1|none|0|acc|0.3280|±|0.0149|
|hellaswag|1|none|0|acc|0.2914|±|0.0045|
| | |none|0|acc_norm|0.3178|±|0.0046|
|lambada_openai|1|none|0|acc|0.2591|±|0.0061|
| | |none|0|perplexity|95.5121|±|4.1325|
|lambada_standard|1|none|0|acc|0.1716|±|0.0053|
| | |none|0|perplexity|488.2170|±|23.4634|
|piqa|1|none|0|acc|0.6224|±|0.0113|
| | |none|0|acc_norm|0.6208|±|0.0113|
|sciq|1|none|0|acc|0.7720|±|0.0133|
| | |none|0|acc_norm|0.6810|±|0.0147|
|wikitext|2|none|0|bits_per_byte|1.0267|±|N/A|
| | |none|0|byte_perplexity|2.0374|±|N/A|
| | |none|0|word_perplexity|44.9548|±|N/A|
|winogrande|1|none|0|acc|0.5099|±|0.0140|

|Groups|Version|Filter|n-shot|Metric|Value| |Stderr|
|---|---:|---|---:|---|---:|---|---:|
|blimp|2|none| |acc|0.7631|±|0.0014|