Harley-ml
/

Tenete-8M

+---
+model-index:  # evaluation results for the model named below
+  - name: Tenete-8M
+    results:  # one entry per benchmark and shot setting; the shot count is recorded only in each task name
+      # ====================== 0-Shot ======================
+      # --- Multiple-choice tasks (task.type: multiple-choice) ---
+      - task:
+          type: multiple-choice
+          name: ANLI R1 (0-Shot)
+        dataset:
+          type: anli_r1
+          name: ANLI R1
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.3690
+      - task:
+          type: multiple-choice
+          name: ANLI R2 (0-Shot)
+        dataset:
+          type: anli_r2
+          name: ANLI R2
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.3310
+      - task:
+          type: multiple-choice
+          name: ANLI R3 (0-Shot)
+        dataset:
+          type: anli_r3
+          name: ANLI R3
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.3233
+      - task:
+          type: multiple-choice
+          name: ARC Challenge (0-Shot)
+        dataset:
+          type: arc_challenge
+          name: ARC Challenge
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.1809
+          - name: accuracy_norm  # NOTE(review): presumably a normalized accuracy variant from the eval harness - confirm its definition
+            type: acc_norm
+            value: 0.2210
+      - task:
+          type: multiple-choice
+          name: ARC Easy (0-Shot)
+        dataset:
+          type: arc_easy
+          name: ARC Easy
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.3283
+          - name: accuracy_norm
+            type: acc_norm
+            value: 0.3194
+      - task:
+          type: multiple-choice
+          name: HellaSwag (0-Shot)
+        dataset:
+          type: hellaswag
+          name: HellaSwag
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2649
+          - name: accuracy_norm
+            type: acc_norm
+            value: 0.2677
+      - task:
+          type: multiple-choice
+          name: MMLU (0-Shot)
+        dataset:
+          type: mmlu  # the four MMLU category entries below reuse this dataset type and are distinguished by dataset name
+          name: MMLU
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2300
+      - task:
+          type: multiple-choice
+          name: MMLU Humanities (0-Shot)
+        dataset:
+          type: mmlu
+          name: MMLU Humanities
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2429
+      - task:
+          type: multiple-choice
+          name: MMLU Other (0-Shot)
+        dataset:
+          type: mmlu
+          name: MMLU Other
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2350
+      - task:
+          type: multiple-choice
+          name: MMLU Social Sciences (0-Shot)
+        dataset:
+          type: mmlu
+          name: MMLU Social Sciences
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2168
+      - task:
+          type: multiple-choice
+          name: MMLU STEM (0-Shot)
+        dataset:
+          type: mmlu
+          name: MMLU STEM
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2185
+      - task:
+          type: multiple-choice
+          name: PiQA (0-Shot)
+        dataset:
+          type: piqa
+          name: PiQA
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.5544
+          - name: accuracy_norm
+            type: acc_norm
+            value: 0.5571
+      - task:
+          type: multiple-choice
+          name: SWAG (0-Shot)
+        dataset:
+          type: swag
+          name: SWAG
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.3024
+          - name: accuracy_norm
+            type: acc_norm
+            value: 0.3297
+      - task:
+          type: multiple-choice
+          name: TruthfulQA MC1 (0-Shot)
+        dataset:
+          type: truthfulqa_mc1
+          name: TruthfulQA MC1
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2705
+      - task:
+          type: multiple-choice
+          name: TruthfulQA MC2 (0-Shot)
+        dataset:
+          type: truthfulqa_mc2
+          name: TruthfulQA MC2
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.4591
+      # --- Generation tasks (task.type: text-generation) ---
+      - task:
+          type: text-generation
+          name: GSM8K (0-Shot)
+        dataset:
+          type: gsm8k
+          name: GSM8K
+        metrics:
+          - name: exact_match (flexible-extract)  # both GSM8K metrics share type exact_match; only the name tells them apart
+            type: exact_match
+            value: 0.0114
+          - name: exact_match (strict-match)
+            type: exact_match
+            value: 0.0015
+      - task:
+          type: text-generation
+          name: TruthfulQA Gen (0-Shot)
+        dataset:
+          type: truthfulqa_gen
+          name: TruthfulQA Gen
+        metrics:
+          - name: bleu_acc  # the *_acc values in this entry fall in [0, 1]; the *_diff values are negative and the *_max values exceed 1
+            type: bleu_acc
+            value: 0.2399
+          - name: bleu_diff
+            type: bleu_diff
+            value: -1.2697
+          - name: bleu_max
+            type: bleu_max
+            value: 10.7605
+          - name: rouge1_acc
+            type: rouge1_acc
+            value: 0.2864
+          - name: rouge1_diff
+            type: rouge1_diff
+            value: -2.4981
+          - name: rouge1_max
+            type: rouge1_max
+            value: 22.1008
+          - name: rouge2_acc
+            type: rouge2_acc
+            value: 0.0979
+          - name: rouge2_diff
+            type: rouge2_diff
+            value: -1.7592
+          - name: rouge2_max
+            type: rouge2_max
+            value: 11.8332
+          - name: rougeL_acc
+            type: rougeL_acc
+            value: 0.2815
+          - name: rougeL_diff
+            type: rougeL_diff
+            value: -2.2800
+          - name: rougeL_max
+            type: rougeL_max
+            value: 20.7733
+      # ====================== 5-Shot ======================
+      # --- Multiple-choice tasks (task.type: multiple-choice) ---
+      - task:
+          type: multiple-choice
+          name: ANLI R1 (5-Shot)
+        dataset:
+          type: anli_r1
+          name: ANLI R1
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.3500
+      - task:
+          type: multiple-choice
+          name: ANLI R2 (5-Shot)
+        dataset:
+          type: anli_r2
+          name: ANLI R2
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.3340
+      - task:
+          type: multiple-choice
+          name: ANLI R3 (5-Shot)
+        dataset:
+          type: anli_r3
+          name: ANLI R3
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.3250
+      - task:
+          type: multiple-choice
+          name: ARC Challenge (5-Shot)
+        dataset:
+          type: arc_challenge
+          name: ARC Challenge
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.1843
+          - name: accuracy_norm  # NOTE(review): presumably a normalized accuracy variant from the eval harness - confirm its definition
+            type: acc_norm
+            value: 0.2184
+      - task:
+          type: multiple-choice
+          name: ARC Easy (5-Shot)
+        dataset:
+          type: arc_easy
+          name: ARC Easy
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.3380
+          - name: accuracy_norm
+            type: acc_norm
+            value: 0.3215
+      - task:
+          type: multiple-choice
+          name: HellaSwag (5-Shot)
+        dataset:
+          type: hellaswag
+          name: HellaSwag
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2644
+          - name: accuracy_norm
+            type: acc_norm
+            value: 0.2657
+      - task:
+          type: multiple-choice
+          name: MMLU (5-Shot)
+        dataset:
+          type: mmlu  # the four MMLU category entries below reuse this dataset type and are distinguished by dataset name
+          name: MMLU
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2413
+      - task:
+          type: multiple-choice
+          name: MMLU Humanities (5-Shot)
+        dataset:
+          type: mmlu
+          name: MMLU Humanities
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2446
+      - task:
+          type: multiple-choice
+          name: MMLU Other (5-Shot)
+        dataset:
+          type: mmlu
+          name: MMLU Other
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2288
+      - task:
+          type: multiple-choice
+          name: MMLU Social Sciences (5-Shot)
+        dataset:
+          type: mmlu
+          name: MMLU Social Sciences
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2317
+      - task:
+          type: multiple-choice
+          name: MMLU STEM (5-Shot)
+        dataset:
+          type: mmlu
+          name: MMLU STEM
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2578
+      - task:
+          type: multiple-choice
+          name: PiQA (5-Shot)
+        dataset:
+          type: piqa
+          name: PiQA
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.5560
+          - name: accuracy_norm
+            type: acc_norm
+            value: 0.5533
+      - task:
+          type: multiple-choice
+          name: SWAG (5-Shot)
+        dataset:
+          type: swag
+          name: SWAG
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2963
+          - name: accuracy_norm
+            type: acc_norm
+            value: 0.3201
+      - task:
+          type: multiple-choice
+          name: TruthfulQA MC1 (5-Shot)*
+        dataset:
+          type: truthfulqa_mc1
+          name: TruthfulQA MC1
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.2705
+        # * 5-shot setting not applicable; value is identical to the 0-shot result (this footnote is a comment only, so parsers drop it)
+      - task:
+          type: multiple-choice
+          name: TruthfulQA MC2 (5-Shot)*
+        dataset:
+          type: truthfulqa_mc2
+          name: TruthfulQA MC2
+        metrics:
+          - name: accuracy
+            type: acc
+            value: 0.4591
+        # * 5-shot setting not applicable; value is identical to the 0-shot result (this footnote is a comment only, so parsers drop it)
+      # --- Generation tasks (task.type: text-generation) ---
+      - task:
+          type: text-generation
+          name: GSM8K (5-Shot)
+        dataset:
+          type: gsm8k
+          name: GSM8K
+        metrics:
+          - name: exact_match (flexible-extract)  # both GSM8K metrics share type exact_match; only the name tells them apart
+            type: exact_match
+            value: 0.0114
+          - name: exact_match (strict-match)
+            type: exact_match
+            value: 0.0015
+        # NOTE(review): 5-shot yields the same numbers as the 0-shot GSM8K entry in this run (0.0114 / 0.0015); no "*" marker is used here - confirm this is a genuine 5-shot result
+      - task:
+          type: text-generation
+          name: TruthfulQA Gen (5-Shot)*
+        dataset:
+          type: truthfulqa_gen
+          name: TruthfulQA Gen
+        metrics:
+          - name: bleu_acc
+            type: bleu_acc
+            value: 0.2399
+          - name: bleu_diff
+            type: bleu_diff
+            value: -1.2697
+          - name: bleu_max
+            type: bleu_max
+            value: 10.7605
+          - name: rouge1_acc
+            type: rouge1_acc
+            value: 0.2864
+          - name: rouge1_diff
+            type: rouge1_diff
+            value: -2.4981
+          - name: rouge1_max
+            type: rouge1_max
+            value: 22.1008
+          - name: rouge2_acc
+            type: rouge2_acc
+            value: 0.0979
+          - name: rouge2_diff
+            type: rouge2_diff
+            value: -1.7592
+          - name: rouge2_max
+            type: rouge2_max
+            value: 11.8332
+          - name: rougeL_acc
+            type: rougeL_acc
+            value: 0.2815
+          - name: rougeL_diff
+            type: rougeL_diff
+            value: -2.2800
+          - name: rougeL_max
+            type: rougeL_max
+            value: 20.7733
+        # * 5-shot setting not applicable; values are identical to the 0-shot results (this footnote is a comment only, so parsers drop it)
+license: mit
+language:  # sequence items are indented by two spaces, matching the lists under model-index
+  - en
+tags:
+  - harley-ml
+  - tenete
+  - small
+  - sota
+  - slm
+  - text-generation
+---