import os
from typing import Dict, List, Tuple, Optional, Any
from huggingface_hub import HfApi
# --- 1. Repository & API Configuration ---
# Hub namespace that owns the leaderboard Space and its companion repos.
OWNER: str = "qimma"

# Every repository id is "<OWNER>/<repo name>".
REPO_ID: str = OWNER + "/Qimma-Leaderboard"
RESULTS_REPO_ID: str = OWNER + "/leaderboard-results"
REQUESTS_REPO_ID: str = OWNER + "/leaderboard-requests"
DETAILS_REPO_ID: str = OWNER + "/leaderboard-details"
# Slack webhook target; an empty string is used when the variable is unset.
SLACK_WEBHOOK_URL = os.environ.get("SLACK_WEBHOOK_URL", "")

# Hub access token read from the environment; None when HF_API_TOKEN is unset.
hf_api_token: Optional[str] = os.environ.get("HF_API_TOKEN")

# Shared Hub client, constructed once at import time with that token.
API = HfApi(token=hf_api_token)
def _int_from_env(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an int.

    Falls back to *default* when the variable is unset, or when it is set to
    something ``int()`` cannot parse (for example an empty string). A bad
    value is logged and ignored so that a misconfigured variable does not
    abort the module import.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        # Imported locally so this block does not depend on a file-level import.
        import logging

        logging.getLogger(__name__).warning(
            "Ignoring invalid %s=%r; using default %d", name, raw, default
        )
        return default


# Page size for benchmark details; overridable through the environment.
BENCHMARK_DETAILS_PAGE_SIZE: int = _int_from_env("BENCHMARK_DETAILS_PAGE_SIZE", 120)
# --- 2. Task Definitions ---
# Format: (dataset_key, metric_field, display_name)
# Metric key shared by most entries below; spelled once so the entries stay
# readable and cannot drift apart through a typo.
_ACC_CHAR_NORM: str = (
    "acc:logprob_normalization="
    "LogProbCharNorm(name='norm', ignore_first_space=False)"
)

# One (dataset_key, metric_field, display_name) tuple per benchmark.
RESULTS_TASKS: List[Tuple[Any, Any, str]] = [
    ("qimma-AraDiCE-Culture:_average|0", _ACC_CHAR_NORM, "AraDiCE-Culture"),
    ("qimma-ArabicMMLU:_average|0", _ACC_CHAR_NORM, "ArabicMMLU"),
    ("qimma-ArabCulture:_average|0", _ACC_CHAR_NORM, "ArabCulture"),
    ("qimma-PalmX:_average|0", _ACC_CHAR_NORM, "PALMX"),
    ("qimma-3lmSTEM:_average|0", _ACC_CHAR_NORM, "3LM STEM"),
    ("qimma-AraTrust|0", _ACC_CHAR_NORM, "AraTrust"),
    ("qimma-Mizan|0", "normalized_mc_prob_norm", "MizanQA"),
    # Composite entry: a list of dataset keys and a list of (metric, weight)
    # pairs instead of single strings. Presumably the two lists are paired by
    # position and combined with the given weights - confirm against the loader.
    (
        ["qimma-MedArabiQ:_average|0", "qimma-MedArabicQ-QA:_average|0"],
        [(_ACC_CHAR_NORM, 0.5), ("BERTScore-F", 0.5)],
        "MedArabiQ",
    ),
    ("qimma-ArablegalEval-QA|0", "BERTScore-F", "ArabLegalQA"),
    ("qimma-GAT:_average|0", _ACC_CHAR_NORM, "GAT"),
    ("qimma-MedAraBench:_average|0", _ACC_CHAR_NORM, "MedAraBench"),
]
# Code-generation benchmarks, in the same
# (dataset_key, metric_field, display_name) layout as the other task lists.
CODE_TASKS: List[Tuple[Any, Any, str]] = [
    ("humaneval", "humaneval+", "HumanEval+"),
    ("mbpp", "mbpp+", "MBPP+"),
]
# The single FannOrFlop benchmark, kept as its own list so it can be attached
# to its own source entry. Layout: (dataset_key, metric_field, display_name).
FANNFLOP_TASKS: List[Tuple[Any, Any, str]] = [
    ("fannorflop", "f1", "FannOrFlop"),
]
# Maps each result source to the task list it provides. "prefixes" is
# presumably the set of name prefixes used to locate that source's files -
# confirm against the code that consumes this table.
TASK_SOURCES: Dict[str, Dict[str, Any]] = {
    "results": {"prefixes": ["results"], "tasks": RESULTS_TASKS},
    "code": {"prefixes": ["code"], "tasks": CODE_TASKS},
    "fannflop": {
        # Keep this configurable for future rename.
        "prefixes": ["fannflop"],
        "tasks": FANNFLOP_TASKS,
    },
}
# Flat list of every visible task, in source order: results, then code, then
# FannOrFlop. HIDDEN_TASKS is not part of this list.
TASKS: List[Tuple[Any, Any, str]] = [
    *RESULTS_TASKS,
    *CODE_TASKS,
    *FANNFLOP_TASKS,
]
# Hidden tasks: extracted from result files but excluded from EVAL_COLUMNS and
# the Average calculation. Used to expose MedArabiQ subset-level scores when
# the Task Type filter is set to MCQ-only or QA-only on the frontend.
HIDDEN_TASKS: List[Tuple[Any, Any, str]] = [
    # Multiple-choice subset of MedArabiQ.
    (
        "qimma-MedArabiQ:_average|0",
        "acc:logprob_normalization=LogProbCharNorm(name='norm', ignore_first_space=False)",
        "MedArabiQ_MCQ",
    ),
    # Question-answering subset of MedArabiQ.
    (
        "qimma-MedArabicQ-QA:_average|0",
        "BERTScore-F",
        "MedArabiQ_QA",
    ),
]
# Per-benchmark metadata consumed by the frontend Task Type and Domain filters.
# Built from a compact (display_name, domain, task_type) table; each row becomes
# {"domain": ..., "task_type": ...} keyed by the benchmark's display name.
BENCHMARK_METADATA: Dict[str, Dict[str, str]] = {
    display_name: {"domain": domain, "task_type": task_type}
    for display_name, domain, task_type in (
        ("AraDiCE-Culture", "Cultural", "MCQ"),
        ("ArabicMMLU", "STEM", "MCQ"),
        ("ArabCulture", "Cultural", "MCQ"),
        ("PALMX", "Cultural", "MCQ"),
        ("3LM STEM", "STEM", "MCQ"),
        ("AraTrust", "Safety", "MCQ"),
        ("MizanQA", "Legal", "MCQ"),
        ("MedArabiQ", "Medical", "mixed"),
        ("ArabLegalQA", "Legal", "QA"),
        ("GAT", "STEM", "MCQ"),
        ("MedAraBench", "Medical", "MCQ"),
        ("HumanEval+", "Coding", "Code"),
        ("MBPP+", "Coding", "Code"),
        ("FannOrFlop", "Poetry", "QA"),
    )
}
# --- 3. Model Metadata ---
# Emoji shown next to each model type. The glyphs had been corrupted into
# mojibake (UTF-8 bytes rendered through a single-byte codepage); they are
# restored here. Commented-out rows are earlier categories kept for reference.
MODEL_TYPE_TO_EMOJI: Dict[str, str] = {
    # "pre-trained": "🟢",
    "base": "🟢",  # U+1F7E2 large green circle
    "instruct": "🔶",  # U+1F536 large orange diamond
    # "fine-tuned": "🔶",
    # "🟢 : pretrained": "🟢",
    # "🟩 : continuously pretrained": "🟩",
    # "💬 : chat models (RLHF, DPO, IFT, ...)": "💬",
    # "🔶 : fine-tuned on domain-specific datasets": "🔶",
    # "🤝 : base merges and merges": "🤝",
    "Missing": "?",  # placeholder entry under the "Missing" key
}