"""Static configuration for the Qimma leaderboard.

Defines the Hugging Face Hub repository IDs, the shared ``HfApi`` client, and
the benchmark task tables (dataset key, metric field, display name).
"""

import os
from typing import Dict, List, Tuple, Optional, Any

from huggingface_hub import HfApi

# --- 1. Repository & API Configuration ---

OWNER: str = "qimma"
REPO_ID: str = f"{OWNER}/Qimma-Leaderboard"
RESULTS_REPO_ID: str = f"{OWNER}/leaderboard-results"
REQUESTS_REPO_ID: str = f"{OWNER}/leaderboard-requests"
DETAILS_REPO_ID: str = f"{OWNER}/leaderboard-details"

# Empty string when SLACK_WEBHOOK_URL is not set.
SLACK_WEBHOOK_URL: str = os.getenv("SLACK_WEBHOOK_URL", "")

# None when HF_API_TOKEN is not set; handed to HfApi unchanged.
hf_api_token: Optional[str] = os.getenv("HF_API_TOKEN")
API = HfApi(token=hf_api_token)

# os.getenv only applies its default when the variable is *unset*; a variable
# that is set but blank would reach int("") and raise ValueError at import
# time. Treat blank as unset (default 120). Non-numeric values still raise so
# that a real misconfiguration is not silently hidden.
BENCHMARK_DETAILS_PAGE_SIZE: int = int(
    os.getenv("BENCHMARK_DETAILS_PAGE_SIZE", "").strip() or "120"
)

# --- 2. Task Definitions ---
# Format: (dataset_key, metric_field, display_name)

# Metric field shared by most benchmarks below. Split across two literals only
# for line length; the concatenated value is the exact key to look up.
_ACC_CHAR_NORM: str = (
    "acc:logprob_normalization="
    "LogProbCharNorm(name='norm', ignore_first_space=False)"
)

RESULTS_TASKS: List[Tuple[Any, Any, str]] = [
    ("qimma-AraDiCE-Culture:_average|0", _ACC_CHAR_NORM, "AraDiCE-Culture"),
    ("qimma-ArabicMMLU:_average|0", _ACC_CHAR_NORM, "ArabicMMLU"),
    ("qimma-ArabCulture:_average|0", _ACC_CHAR_NORM, "ArabCulture"),
    ("qimma-PalmX:_average|0", _ACC_CHAR_NORM, "PALMX"),
    ("qimma-3lmSTEM:_average|0", _ACC_CHAR_NORM, "3LM STEM"),
    ("qimma-AraTrust|0", _ACC_CHAR_NORM, "AraTrust"),
    ("qimma-Mizan|0", "normalized_mc_prob_norm", "MizanQA"),
    # Composite entry: dataset_key is a list of keys and metric_field a list
    # of (metric, 0.5) pairs -- presumably per-metric weights for a combined
    # score; confirm against the code that consumes this table.
    (
        ["qimma-MedArabiQ:_average|0", "qimma-MedArabicQ-QA:_average|0"],
        [(_ACC_CHAR_NORM, 0.5), ("BERTScore-F", 0.5)],
        "MedArabiQ",
    ),
    ("qimma-ArablegalEval-QA|0", "BERTScore-F", "ArabLegalQA"),
    ("qimma-GAT:_average|0", _ACC_CHAR_NORM, "GAT"),
    ("qimma-MedAraBench:_average|0", _ACC_CHAR_NORM, "MedAraBench"),
]

CODE_TASKS: List[Tuple[Any, Any, str]] = [
    ("humaneval", "humaneval+", "HumanEval+"),
    ("mbpp", "mbpp+", "MBPP+"),
]

FANNFLOP_TASKS: List[Tuple[Any, Any, str]] = [
    ("fannorflop", "f1", "FannOrFlop"),
]

# Groups the task tables by source name, each with its list of prefixes.
TASK_SOURCES: Dict[str, Dict[str, Any]] = {
    "results": {
        "prefixes": ["results"],
        "tasks": RESULTS_TASKS,
    },
    "code": {
        "prefixes": ["code"],
        "tasks": CODE_TASKS,
    },
    "fannflop": {
        # Keep this configurable for future rename.
        "prefixes": ["fannflop"],
        "tasks": FANNFLOP_TASKS,
    },
}

# All visible tasks, in display order: results, then code, then FannOrFlop.
TASKS: List[Tuple[Any, Any, str]] = RESULTS_TASKS + CODE_TASKS + FANNFLOP_TASKS

# Hidden tasks: extracted from result files but excluded from EVAL_COLUMNS and
# the Average calculation. Used to expose MedArabiQ subset-level scores when
# the Task Type filter is set to MCQ-only or QA-only on the frontend.
HIDDEN_TASKS: List[Tuple[Any, Any, str]] = [
    (
        "qimma-MedArabiQ:_average|0",
        _ACC_CHAR_NORM,
        "MedArabiQ_MCQ",
    ),
    (
        "qimma-MedArabicQ-QA:_average|0",
        "BERTScore-F",
        "MedArabiQ_QA",
    ),
]

# Per-benchmark metadata consumed by the frontend Task Type and Domain filters.
# Keys are benchmark display names; every value carries exactly the two
# fields "domain" and "task_type". Rows below are
# (display name, domain, task type), expanded into the nested mapping.
BENCHMARK_METADATA: Dict[str, Dict[str, str]] = {
    benchmark: {"domain": domain, "task_type": task_type}
    for benchmark, domain, task_type in (
        ("AraDiCE-Culture", "Cultural", "MCQ"),
        ("ArabicMMLU", "STEM", "MCQ"),
        ("ArabCulture", "Cultural", "MCQ"),
        ("PALMX", "Cultural", "MCQ"),
        ("3LM STEM", "STEM", "MCQ"),
        ("AraTrust", "Safety", "MCQ"),
        ("MizanQA", "Legal", "MCQ"),
        ("MedArabiQ", "Medical", "mixed"),
        ("ArabLegalQA", "Legal", "QA"),
        ("GAT", "STEM", "MCQ"),
        ("MedAraBench", "Medical", "MCQ"),
        ("HumanEval+", "Coding", "Code"),
        ("MBPP+", "Coding", "Code"),
        ("FannOrFlop", "Poetry", "QA"),
    )
}

# --- 3. Model Metadata ---

# Maps a model-type label to its emoji marker; "Missing" maps to "?".
MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
    "base": "🟢",
    "instruct": "🔶",
    "Missing": "?",
    # Disabled entries, kept for reference:
    # "pre-trained": "🟢",
    # "fine-tuned": "🔶",
    # "🟢 : pretrained": "🟢",
    # "🟩 : continuously pretrained": "🟩",
    # "💬 : chat models (RLHF, DPO, IFT, ...)": "💬",
    # "🔶 : fine-tuned on domain-specific datasets": "🔶",
    # "🤝 : base merges and merges": "🤝",
}