# leaderboard/backend/config.py
# Author: LeenAlQadi
# Commit de2f8be: dark mode overhaul, fix filter panel, and UI polish
import os
from typing import Dict, List, Tuple, Optional, Any
from huggingface_hub import HfApi
# --- 1. Repository & API Configuration ---
# Namespace prefix shared by every repository id below ("<OWNER>/<name>").
OWNER: str = "qimma"
REPO_ID: str = OWNER + "/Qimma-Leaderboard"
RESULTS_REPO_ID: str = OWNER + "/leaderboard-results"
REQUESTS_REPO_ID: str = OWNER + "/leaderboard-requests"
DETAILS_REPO_ID: str = OWNER + "/leaderboard-details"
# Read from the environment; an unset SLACK_WEBHOOK_URL becomes "".
SLACK_WEBHOOK_URL: str = os.environ.get("SLACK_WEBHOOK_URL", "")
# HF_API_TOKEN may be unset, in which case the client below receives token=None.
hf_api_token: Optional[str] = os.environ.get("HF_API_TOKEN")
# Shared Hugging Face Hub client, created once at import time.
API = HfApi(token=hf_api_token)
# Integer read from the BENCHMARK_DETAILS_PAGE_SIZE environment variable.
# An unset or blank variable falls back to 120; without the `or`, a variable
# that is set but empty would make int("") raise ValueError at import time.
# A non-blank, non-numeric value still raises ValueError so that a genuine
# misconfiguration is not silently ignored.
BENCHMARK_DETAILS_PAGE_SIZE: int = int(
    os.getenv("BENCHMARK_DETAILS_PAGE_SIZE", "").strip() or "120"
)
# --- 2. Task Definitions ---
# Each entry is (dataset_key, metric_field, display_name).
# The MedArabiQ entry is the one composite: its first element is a list of
# dataset keys and its second a list of (metric_field, 0.5) pairs
# (presumably per-metric weights -- confirm against the code that reads it).
# Metric field shared by most entries, spelled out once.
_ACC_CHAR_NORM: str = "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)"
RESULTS_TASKS: List[Tuple[Any, Any, str]] = [
    ("qimma-AraDiCE-Culture:_average|0", _ACC_CHAR_NORM, "AraDiCE-Culture"),
    ("qimma-ArabicMMLU:_average|0", _ACC_CHAR_NORM, "ArabicMMLU"),
    ("qimma-ArabCulture:_average|0", _ACC_CHAR_NORM, "ArabCulture"),
    ("qimma-PalmX:_average|0", _ACC_CHAR_NORM, "PALMX"),
    ("qimma-3lmSTEM:_average|0", _ACC_CHAR_NORM, "3LM STEM"),
    ("qimma-AraTrust|0", _ACC_CHAR_NORM, "AraTrust"),
    ("qimma-Mizan|0", "normalized_mc_prob_norm", "MizanQA"),
    (
        ["qimma-MedArabiQ:_average|0", "qimma-MedArabicQ-QA:_average|0"],
        [(_ACC_CHAR_NORM, 0.5), ("BERTScore-F", 0.5)],
        "MedArabiQ",
    ),
    ("qimma-ArablegalEval-QA|0", "BERTScore-F", "ArabLegalQA"),
    ("qimma-GAT:_average|0", _ACC_CHAR_NORM, "GAT"),
    ("qimma-MedAraBench:_average|0", _ACC_CHAR_NORM, "MedAraBench"),
]
# Code benchmarks, as (dataset_key, metric_field, display_name) entries.
CODE_TASKS: List[Tuple[Any, Any, str]] = [
    ("humaneval", "humaneval+", "HumanEval+"),
    ("mbpp", "mbpp+", "MBPP+"),
]
# Single-entry list for the FannOrFlop benchmark, same tuple layout.
FANNFLOP_TASKS: List[Tuple[Any, Any, str]] = [("fannorflop", "f1", "FannOrFlop")]
# Maps each source name to its list of prefixes and the task list it provides.
# Prefixes are listed separately from the source name so that either can be
# changed without touching the other.
TASK_SOURCES: Dict[str, Dict[str, Any]] = {
    source: {"prefixes": prefixes, "tasks": tasks}
    for source, prefixes, tasks in (
        ("results", ["results"], RESULTS_TASKS),
        ("code", ["code"], CODE_TASKS),
        # Keep this configurable for future rename.
        ("fannflop", ["fannflop"], FANNFLOP_TASKS),
    )
}
# Flat list of every visible task: results first, then code, then FannOrFlop.
TASKS: List[Tuple[Any, Any, str]] = [*RESULTS_TASKS, *CODE_TASKS, *FANNFLOP_TASKS]
# Hidden tasks are read from the result files like any other task, but they are
# left out of EVAL_COLUMNS and of the Average calculation. They expose the
# MedArabiQ subset-level scores that the frontend needs when its Task Type
# filter is set to MCQ-only or QA-only.
HIDDEN_TASKS: List[Tuple[Any, Any, str]] = [
    # MCQ subset of MedArabiQ.
    (
        "qimma-MedArabiQ:_average|0",
        "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",
        "MedArabiQ_MCQ",
    ),
    # QA subset of MedArabiQ.
    ("qimma-MedArabicQ-QA:_average|0", "BERTScore-F", "MedArabiQ_QA"),
]
# Domain and task type of every benchmark, keyed by display name. The frontend
# Task Type and Domain filters consume this mapping.
BENCHMARK_METADATA: Dict[str, Dict[str, str]] = {
    display_name: {"domain": domain, "task_type": task_type}
    for display_name, domain, task_type in (
        ("AraDiCE-Culture", "Cultural", "MCQ"),
        ("ArabicMMLU", "STEM", "MCQ"),
        ("ArabCulture", "Cultural", "MCQ"),
        ("PALMX", "Cultural", "MCQ"),
        ("3LM STEM", "STEM", "MCQ"),
        ("AraTrust", "Safety", "MCQ"),
        ("MizanQA", "Legal", "MCQ"),
        ("MedArabiQ", "Medical", "mixed"),
        ("ArabLegalQA", "Legal", "QA"),
        ("GAT", "STEM", "MCQ"),
        ("MedAraBench", "Medical", "MCQ"),
        ("HumanEval+", "Coding", "Code"),
        ("MBPP+", "Coding", "Code"),
        ("FannOrFlop", "Poetry", "QA"),
    )
}
# --- 3. Model Metadata ---
# Maps a model-type label to its emoji marker; "Missing" maps to "?".
# The emoji had been stored as mojibake (UTF-8 bytes decoded through a legacy
# code page); they are restored here to the intended code points.
# The commented-out entries are alternative labels that are currently disabled.
MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
    # "pre-trained": "🟢",
    "base": "🟢",
    "instruct": "🔶",
    # "fine-tuned": "🔶",
    # "🟢 : pretrained": "🟢",
    # "🟩 : continuously pretrained": "🟩",
    # "💬 : chat models (RLHF, DPO, IFT, ...)": "💬",
    # "🔶 : fine-tuned on domain-specific datasets": "🔶",
    # "🤝 : base merges and merges": "🤝",
    "Missing": "?",
}