Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| import os | |
| from typing import Dict, List, Tuple, Optional, Any | |
| from huggingface_hub import HfApi | |
# --- 1. Repository & API Configuration ---
# All repository ids live under a single owner namespace, so each one is the
# owner name followed by "/<repo name>".
OWNER: str = "qimma"
REPO_ID: str = OWNER + "/Qimma-Leaderboard"
RESULTS_REPO_ID: str = OWNER + "/leaderboard-results"
REQUESTS_REPO_ID: str = OWNER + "/leaderboard-requests"
DETAILS_REPO_ID: str = OWNER + "/leaderboard-details"
# Settings read from the environment at import time.
# SLACK_WEBHOOK_URL is the empty string when the variable is unset.
SLACK_WEBHOOK_URL: str = os.environ.get("SLACK_WEBHOOK_URL", "")
# hf_api_token is None when HF_API_TOKEN is unset; it is handed to HfApi as-is.
hf_api_token: Optional[str] = os.environ.get("HF_API_TOKEN")
API = HfApi(token=hf_api_token)
def _int_from_env(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an int.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is unset or unparsable.

    Returns:
        ``int(value)`` when the variable is set to something ``int()`` accepts,
        otherwise *default*.  An unparsable value (for example an empty or
        non-numeric string) is reported through ``logging`` instead of raising
        ``ValueError`` while the module is being imported.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        # Function-scope import so this block does not depend on a new
        # file-level import.
        import logging

        logging.getLogger(__name__).warning(
            "Ignoring invalid %s=%r; using default %d", name, raw, default
        )
        return default


# Read from the BENCHMARK_DETAILS_PAGE_SIZE environment variable; 120 when the
# variable is unset or does not hold an integer.
BENCHMARK_DETAILS_PAGE_SIZE: int = _int_from_env("BENCHMARK_DETAILS_PAGE_SIZE", 120)
# --- 2. Task Definitions ---
# Format: (dataset_key, metric_field, display_name)
# The MedArabiQ entry is the one composite: its first element is a list of
# dataset keys and its second a list of (metric_field, number) pairs -- the
# numbers are presumably averaging weights; confirm against the consumer.
_CHAR_NORM_ACC: str = (
    "acc:logprob_normalization="
    "LogProbCharNorm(name='norm', ignore_first_space=False)"
)
_BERT_SCORE_F: str = "BERTScore-F"

RESULTS_TASKS: List[Tuple[Any, Any, str]] = [
    ("qimma-AraDiCE-Culture:_average|0", _CHAR_NORM_ACC, "AraDiCE-Culture"),
    ("qimma-ArabicMMLU:_average|0", _CHAR_NORM_ACC, "ArabicMMLU"),
    ("qimma-ArabCulture:_average|0", _CHAR_NORM_ACC, "ArabCulture"),
    ("qimma-PalmX:_average|0", _CHAR_NORM_ACC, "PALMX"),
    ("qimma-3lmSTEM:_average|0", _CHAR_NORM_ACC, "3LM STEM"),
    ("qimma-AraTrust|0", _CHAR_NORM_ACC, "AraTrust"),
    ("qimma-Mizan|0", "normalized_mc_prob_norm", "MizanQA"),
    (
        ["qimma-MedArabiQ:_average|0", "qimma-MedArabicQ-QA:_average|0"],
        [(_CHAR_NORM_ACC, 0.5), (_BERT_SCORE_F, 0.5)],
        "MedArabiQ",
    ),
    ("qimma-ArablegalEval-QA|0", _BERT_SCORE_F, "ArabLegalQA"),
    ("qimma-GAT:_average|0", _CHAR_NORM_ACC, "GAT"),
    ("qimma-MedAraBench:_average|0", _CHAR_NORM_ACC, "MedAraBench"),
]
# Code benchmarks: the metric field is the dataset key with a "+" suffix.
CODE_TASKS: List[Tuple[Any, Any, str]] = [
    (dataset, dataset + "+", label)
    for dataset, label in (("humaneval", "HumanEval+"), ("mbpp", "MBPP+"))
]
# FannOrFlop is a single task scored on its "f1" field.
FANNFLOP_TASKS: List[Tuple[Any, Any, str]] = [("fannorflop", "f1", "FannOrFlop")]
# Task groups keyed by source name.  Each group pairs a list of prefixes
# (presumably result-file/path prefixes -- confirm against the loader) with
# the task list that belongs to that source.
TASK_SOURCES: Dict[str, Dict[str, Any]] = dict(
    results=dict(prefixes=["results"], tasks=RESULTS_TASKS),
    code=dict(prefixes=["code"], tasks=CODE_TASKS),
    # Keep this configurable for future rename.
    fannflop=dict(prefixes=["fannflop"], tasks=FANNFLOP_TASKS),
)
# Flat list of every visible task, in source order: results, code, fannflop.
TASKS: List[Tuple[Any, Any, str]] = [*RESULTS_TASKS, *CODE_TASKS, *FANNFLOP_TASKS]
# Hidden tasks are read from the result files but left out of EVAL_COLUMNS and
# of the Average calculation.  They expose the MedArabiQ subset-level scores
# for the frontend's Task Type filter when it is set to MCQ-only or QA-only.
_MEDARABIQ_MCQ_TASK: Tuple[Any, Any, str] = (
    "qimma-MedArabiQ:_average|0",
    "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",
    "MedArabiQ_MCQ",
)
_MEDARABIQ_QA_TASK: Tuple[Any, Any, str] = (
    "qimma-MedArabicQ-QA:_average|0",
    "BERTScore-F",
    "MedArabiQ_QA",
)
HIDDEN_TASKS: List[Tuple[Any, Any, str]] = [_MEDARABIQ_MCQ_TASK, _MEDARABIQ_QA_TASK]
# Per-benchmark metadata consumed by the frontend Task Type and Domain filters.
# Rows are (display_name, domain, task_type); the mapping keeps row order.
BENCHMARK_METADATA: Dict[str, Dict[str, str]] = {
    benchmark: {"domain": domain, "task_type": task_type}
    for benchmark, domain, task_type in (
        ("AraDiCE-Culture", "Cultural", "MCQ"),
        ("ArabicMMLU", "STEM", "MCQ"),
        ("ArabCulture", "Cultural", "MCQ"),
        ("PALMX", "Cultural", "MCQ"),
        ("3LM STEM", "STEM", "MCQ"),
        ("AraTrust", "Safety", "MCQ"),
        ("MizanQA", "Legal", "MCQ"),
        ("MedArabiQ", "Medical", "mixed"),
        ("ArabLegalQA", "Legal", "QA"),
        ("GAT", "STEM", "MCQ"),
        ("MedAraBench", "Medical", "MCQ"),
        ("HumanEval+", "Coding", "Code"),
        ("MBPP+", "Coding", "Code"),
        ("FannOrFlop", "Poetry", "QA"),
    )
}
# --- 3. Model Metadata ---
# Maps a model-type label to its emoji marker.
#
# The emoji are written as \U escapes so this mapping is pure ASCII.  The
# literal glyphs that stood here before were corrupted by an encoding
# round-trip (their UTF-8 bytes appear to have been decoded as ISO-8859-7),
# leaving multi-character garbage instead of a single emoji; the code points
# below were recovered from that byte pattern.
#
# Inactive legacy entries, kept for reference (glyphs given as code points):
#   "pre-trained"                                        -> U+1F7E2
#   "fine-tuned"                                         -> U+1F536
#   "<U+1F7E2> : pretrained"                             -> U+1F7E2 (green circle)
#   "<U+1F7E9> : continuously pretrained"                -> U+1F7E9 (green square)
#   "<U+1F4AC> : chat models (RLHF, DPO, IFT, ...)"      -> U+1F4AC (speech balloon)
#   "<U+1F536> : fine-tuned on domain-specific datasets" -> U+1F536 (orange diamond)
#   "<U+1F91D> : base merges and merges"                 -> U+1F91D (handshake)
MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
    "base": "\U0001F7E2",      # LARGE GREEN CIRCLE
    "instruct": "\U0001F536",  # LARGE ORANGE DIAMOND
    "Missing": "?",
}