from dataclasses import dataclass
from enum import Enum, auto
from typing import List

import pandas as pd

# Task table, read once at import time. The relative path is resolved against
# the current working directory of the process.
tasks_df = pd.read_csv("tasks/tasks.csv")

# presumably the number of few-shot examples used for evaluation — TODO confirm
NUM_FEWSHOT = 0


class Language(Enum):
    """Language codes.

    Member names are matched against the upper-cased "Language" column of the
    task table when the task registry is built.
    """

    ALL = auto()
    ES = auto()
    CA = auto()
    EU = auto()
    GL = auto()
    VA = auto()
    PT = auto()
    EN = auto()


class Domain(Enum):
    """Domain labels.

    Member names are matched against the upper-cased "Domain" column of the
    task table when the task registry is built.
    """

    MISCELLANEOUS = auto()
    LANGUAGE = auto()
    LEGAL = auto()
    CLINICAL = auto()
    AVERAGE = auto()


@dataclass
class Category:
    """Description of one per-language "Avg ..." leaderboard column."""

    # Column header, e.g. "Avg ES".
    col_name: str
    # Language the column belongs to.
    language: Language
    # Domains attached to the column
    # (presumably the ones that are averaged — confirm against consumers).
    domains: List[Domain]


def _per_language_category(col_name: str, language: Language) -> Category:
    """Build a Category covering the four non-aggregate domains.

    A new list is created on every call so that no two categories share the
    same ``domains`` object.
    """
    return Category(
        col_name,
        language,
        [Domain.MISCELLANEOUS, Domain.LANGUAGE, Domain.CLINICAL, Domain.LEGAL],
    )


class Categories(Enum):
    """One average column per language, each spanning the same four domains."""

    es = _per_language_category("Avg ES", Language.ES)
    ca = _per_language_category("Avg CA", Language.CA)
    eu = _per_language_category("Avg EU", Language.EU)
    gl = _per_language_category("Avg GL", Language.GL)
    va = _per_language_category("Avg VA", Language.VA)
    pt = _per_language_category("Avg PT", Language.PT)


@dataclass
class Task:
    """
    A single task shown on the leaderboard.

    Attributes:
        benchmark (str): Benchmark dataset name, i.e. task_key in the results JSON file.
        metric (str): Evaluation metric for the task, i.e. metric_key in the results JSON file.
        col_name (str): Header of the leaderboard column for this task.
        language (Language): Language the task is conducted in.
        domain (Domain): Domain category the task belongs to.
    """

    benchmark: str
    metric: str
    col_name: str
    language: Language
    domain: Domain


def _metric_key(metric):
    """Return the metric followed by its filter suffix.

    "exact_match" is paired with "remove_whitespace" to match the harness'
    post normalization; every other metric is paired with "none".
    """
    suffix = "remove_whitespace" if metric == "exact_match" else "none"
    return f"{metric},{suffix}"


def _load_tasks(frame):
    """Map each row's "Harness" value to the Task built from that row."""
    registry = {}
    for _, record in frame.iterrows():
        harness_name = record["Harness"]
        registry[harness_name] = Task(
            harness_name,
            _metric_key(record["Metric"]),
            record["Name"],
            Language[record["Language"].upper()],
            Domain[record["Domain"].upper()],
        )
    return registry


tasks_dict = _load_tasks(tasks_df)

# Enum whose member names are the harness task names and whose values are the
# corresponding Task objects.
Tasks = Enum("Tasks", tasks_dict)