File size: 4,960 Bytes
4ddba7f
c7488db
bb633bc
178c53e
fbb9d41
 
 
f95776a
c0b49da
3725eb1
178c53e
fbb9d41
178c53e
fbb9d41
 
62947e5
4d6faba
fbb9d41
 
c7488db
fbb9d41
 
 
079041d
3638b8c
fbb9d41
f248cb9
510e4b3
 
 
 
 
8993e7a
1482380
 
fbb9d41
4d6faba
c7488db
 
 
 
 
 
86f7358
c7488db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f20c7d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de2f8be
f20c7d0
 
fbb9d41
4d6faba
fbb9d41
b6de4f2
 
 
 
 
 
 
 
 
fbb9d41
de2f8be
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
from typing import Dict, List, Tuple, Optional, Any
from huggingface_hub import HfApi

# --- 1. Repository & API Configuration ---
# Owner namespace shared by every repository id below; each id has the
# form "<OWNER>/<repository name>".
OWNER: str = "qimma"
REPO_ID: str = OWNER + "/Qimma-Leaderboard"
RESULTS_REPO_ID: str = OWNER + "/leaderboard-results"
REQUESTS_REPO_ID: str = OWNER + "/leaderboard-requests"
DETAILS_REPO_ID: str = OWNER + "/leaderboard-details"

# Slack webhook URL taken from the environment; empty string when unset.
SLACK_WEBHOOK_URL = os.environ.get("SLACK_WEBHOOK_URL", "")

# Access token taken from HF_API_TOKEN; None when the variable is unset.
hf_api_token: Optional[str] = os.environ.get("HF_API_TOKEN")
# Module-wide HfApi client constructed with that token.
API = HfApi(token=hf_api_token)
# Page size for benchmark details, overridable through the environment
# variable of the same name.  This module is evaluated at import time, so a
# blank or non-integer value must not raise here: it falls back to the default
# (with a warning) instead of preventing the application from starting.
_DEFAULT_BENCHMARK_DETAILS_PAGE_SIZE: int = 120
try:
    # `or` covers both "unset" (None) and "set but empty" ("").
    BENCHMARK_DETAILS_PAGE_SIZE: int = int(
        os.getenv("BENCHMARK_DETAILS_PAGE_SIZE") or _DEFAULT_BENCHMARK_DETAILS_PAGE_SIZE
    )
except ValueError:
    import logging  # local import: only needed on this misconfiguration path

    logging.getLogger(__name__).warning(
        "Invalid BENCHMARK_DETAILS_PAGE_SIZE=%r; using default %d",
        os.getenv("BENCHMARK_DETAILS_PAGE_SIZE"),
        _DEFAULT_BENCHMARK_DETAILS_PAGE_SIZE,
    )
    BENCHMARK_DETAILS_PAGE_SIZE = _DEFAULT_BENCHMARK_DETAILS_PAGE_SIZE

# --- 2. Task Definitions ---
# Every entry is a (dataset_key, metric_field, display_name) triple.
# A composite benchmark (see MedArabiQ) instead carries a list of dataset keys
# and a list of (metric_field, weight) pairs -- presumably matched by position;
# confirm against the code that consumes these entries.

# Metric names that recur across several benchmarks.
_ACC_CHAR_NORM: str = (
    "acc:logprob_normalization="
    "LogProbCharNorm(name='norm', ignore_first_space=False)"
)
_BERTSCORE_F: str = "BERTScore-F"

RESULTS_TASKS: List[Tuple[Any, Any, str]] = [
    ("qimma-AraDiCE-Culture:_average|0", _ACC_CHAR_NORM, "AraDiCE-Culture"),
    ("qimma-ArabicMMLU:_average|0", _ACC_CHAR_NORM, "ArabicMMLU"),
    ("qimma-ArabCulture:_average|0", _ACC_CHAR_NORM, "ArabCulture"),
    ("qimma-PalmX:_average|0", _ACC_CHAR_NORM, "PALMX"),
    ("qimma-3lmSTEM:_average|0", _ACC_CHAR_NORM, "3LM STEM"),
    ("qimma-AraTrust|0", _ACC_CHAR_NORM, "AraTrust"),
    ("qimma-Mizan|0", "normalized_mc_prob_norm", "MizanQA"),
    (
        # Two datasets, each weighted 0.5, reported under one display name.
        ["qimma-MedArabiQ:_average|0", "qimma-MedArabicQ-QA:_average|0"],
        [(_ACC_CHAR_NORM, 0.5), (_BERTSCORE_F, 0.5)],
        "MedArabiQ",
    ),
    ("qimma-ArablegalEval-QA|0", _BERTSCORE_F, "ArabLegalQA"),
    ("qimma-GAT:_average|0", _ACC_CHAR_NORM, "GAT"),
    ("qimma-MedAraBench:_average|0", _ACC_CHAR_NORM, "MedAraBench"),
]

# Code benchmark entries, in the same (dataset_key, metric_field, display_name)
# layout as the other task lists.
_HUMANEVAL_PLUS_TASK: Tuple[Any, Any, str] = ("humaneval", "humaneval+", "HumanEval+")
_MBPP_PLUS_TASK: Tuple[Any, Any, str] = ("mbpp", "mbpp+", "MBPP+")
CODE_TASKS: List[Tuple[Any, Any, str]] = [_HUMANEVAL_PLUS_TASK, _MBPP_PLUS_TASK]

# FannOrFlop entry, kept in its own single-element list.
_FANNORFLOP_TASK: Tuple[Any, Any, str] = ("fannorflop", "f1", "FannOrFlop")
FANNFLOP_TASKS: List[Tuple[Any, Any, str]] = [_FANNORFLOP_TASK]

# Task lists grouped by source name.  Each group pairs a "prefixes" list with
# the "tasks" list it applies to (the prefixes presumably identify that
# source's files -- confirm against the loader that reads this mapping).
TASK_SOURCES: Dict[str, Dict[str, Any]] = dict(
    results=dict(prefixes=["results"], tasks=RESULTS_TASKS),
    code=dict(prefixes=["code"], tasks=CODE_TASKS),
    # Keep this configurable for future rename.
    fannflop=dict(prefixes=["fannflop"], tasks=FANNFLOP_TASKS),
)

# Flat concatenation of the three task lists, in results/code/fannflop order.
TASKS: List[Tuple[Any, Any, str]] = [*RESULTS_TASKS, *CODE_TASKS, *FANNFLOP_TASKS]

# Hidden tasks: read from the result files but left out of EVAL_COLUMNS and of
# the Average calculation.  They expose the MedArabiQ subset-level scores for
# when the frontend's Task Type filter is set to MCQ-only or QA-only.
_MEDARABIQ_MCQ_TASK: Tuple[Any, Any, str] = (
    "qimma-MedArabiQ:_average|0",
    "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",
    "MedArabiQ_MCQ",
)
_MEDARABIQ_QA_TASK: Tuple[Any, Any, str] = (
    "qimma-MedArabicQ-QA:_average|0",
    "BERTScore-F",
    "MedArabiQ_QA",
)
HIDDEN_TASKS: List[Tuple[Any, Any, str]] = [_MEDARABIQ_MCQ_TASK, _MEDARABIQ_QA_TASK]

# Per-benchmark metadata read by the frontend's Task Type and Domain filters.
# Table rows are (display_name, domain, task_type); the mapping below expands
# each row into {"domain": ..., "task_type": ...} keyed by display name.
BENCHMARK_METADATA: Dict[str, Dict[str, str]] = {
    display_name: {"domain": domain, "task_type": task_type}
    for display_name, domain, task_type in (
        ("AraDiCE-Culture", "Cultural", "MCQ"),
        ("ArabicMMLU", "STEM", "MCQ"),
        ("ArabCulture", "Cultural", "MCQ"),
        ("PALMX", "Cultural", "MCQ"),
        ("3LM STEM", "STEM", "MCQ"),
        ("AraTrust", "Safety", "MCQ"),
        ("MizanQA", "Legal", "MCQ"),
        ("MedArabiQ", "Medical", "mixed"),
        ("ArabLegalQA", "Legal", "QA"),
        ("GAT", "STEM", "MCQ"),
        ("MedAraBench", "Medical", "MCQ"),
        ("HumanEval+", "Coding", "Code"),
        ("MBPP+", "Coding", "Code"),
        ("FannOrFlop", "Poetry", "QA"),
    )
}

# --- 3. Model Metadata ---

# Emoji associated with each model type.  The values had been corrupted into
# mojibake (UTF-8 bytes decoded through a single-byte code page), so they are
# restored here and written as Unicode escapes to keep the source ASCII-safe
# and immune to the same re-encoding problem.
MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
    # "pre-trained": "🟢",
    "base": "\U0001F7E2",  # green circle
    "instruct": "\U0001F536",  # large orange diamond
    # "fine-tuned": "🔶",
    # "🟢 : pretrained": "🟢",
    # "🟩 : continuously pretrained": "🟩",
    # "💬 : chat models (RLHF, DPO, IFT, ...)": "💬",
    # "🔶 : fine-tuned on domain-specific datasets": "🔶",
    # "🤝 : base merges and merges": "🤝",
    "Missing": "?",
}