File size: 2,323 Bytes
251a9d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from dataclasses import dataclass
from enum import Enum, auto
from typing import List

import pandas as pd

# Task definitions table, read once when this module is imported. The path is
# relative, so it is resolved against the current working directory; the
# import fails if tasks/tasks.csv is not found there.
# The code further down reads the "Harness", "Metric", "Name", "Language" and
# "Domain" columns of this table.
tasks_df = pd.read_csv("tasks/tasks.csv")


# NOTE(review): presumably the number of few-shot examples used for the
# evaluations; it is not referenced anywhere in the visible code — confirm.
NUM_FEWSHOT = 0


class Language(Enum):
    """
    Languages that a task or a category can be tagged with.

    Members are numbered consecutively from 1 in declaration order.

    NOTE(review): the two-letter member names look like language codes and
    ALL like a catch-all value — confirm the intended meaning of each.
    """

    ALL = 1
    ES = 2
    CA = 3
    EU = 4
    GL = 5
    VA = 6
    PT = 7
    EN = 8


class Domain(Enum):
    """
    Domain categories that a task can be tagged with.

    Members are numbered consecutively from 1 in declaration order.

    NOTE(review): AVERAGE is not used by any category in the visible code;
    presumably it marks an aggregate rather than a real domain — confirm.
    """

    MISCELLANEOUS = 1
    LANGUAGE = 2
    LEGAL = 3
    CLINICAL = 4
    AVERAGE = 5


@dataclass
class Category:
    """
    Grouping of one language with a list of domains under a column name.

    Attributes:
        col_name (str): Column name of the category, e.g. "Avg ES"
            (presumably a leaderboard column, as for Task.col_name — confirm).
        language (Language): Language the category is tied to.
        domains (List[Domain]): Domains associated with the category.
    """

    col_name: str
    language: Language
    domains: List[Domain]


class Categories(Enum):
    """
    One "Avg <LANG>" category per language, each listing the same four domains.

    Language.ALL and Language.EN have no entry here. Every member is given its
    own list object, so the domain lists are not shared between members.
    """

    es = Category(
        col_name="Avg ES",
        language=Language.ES,
        domains=[Domain.MISCELLANEOUS, Domain.LANGUAGE, Domain.CLINICAL, Domain.LEGAL],
    )
    ca = Category(
        col_name="Avg CA",
        language=Language.CA,
        domains=[Domain.MISCELLANEOUS, Domain.LANGUAGE, Domain.CLINICAL, Domain.LEGAL],
    )
    eu = Category(
        col_name="Avg EU",
        language=Language.EU,
        domains=[Domain.MISCELLANEOUS, Domain.LANGUAGE, Domain.CLINICAL, Domain.LEGAL],
    )
    gl = Category(
        col_name="Avg GL",
        language=Language.GL,
        domains=[Domain.MISCELLANEOUS, Domain.LANGUAGE, Domain.CLINICAL, Domain.LEGAL],
    )
    va = Category(
        col_name="Avg VA",
        language=Language.VA,
        domains=[Domain.MISCELLANEOUS, Domain.LANGUAGE, Domain.CLINICAL, Domain.LEGAL],
    )
    pt = Category(
        col_name="Avg PT",
        language=Language.PT,
        domains=[Domain.MISCELLANEOUS, Domain.LANGUAGE, Domain.CLINICAL, Domain.LEGAL],
    )


@dataclass
class Task:
    """
    Leaderboard task.

    Instances are built from the rows of the tasks CSV when this module is
    imported (see ``tasks_dict``).

    Attributes:
        benchmark (str): Name of the benchmark dataset, i.e. task_key in the
            results JSON file. Taken from the "Harness" CSV column.
        metric (str): Evaluation metric used for the task, i.e. metric_key in
            the results JSON file. Built as "<Metric>,none", or as
            "<Metric>,remove_whitespace" when the CSV metric is "exact_match".
        col_name (str): Column name to display in the leaderboard. Taken from
            the "Name" CSV column.
        language (Language): Language in which the task is conducted.
        domain (Domain): Domain category of the task.
    """

    benchmark: str
    metric: str
    col_name: str
    language: Language
    domain: Domain


def _task_from_row(row: pd.Series) -> Task:
    """Build the Task described by one row of the tasks table."""
    metric_name = row["Metric"]
    # exact_match is looked up under the "remove_whitespace" suffix, every
    # other metric under "none" (to match the harness' post normalization).
    suffix = "remove_whitespace" if metric_name == "exact_match" else "none"
    return Task(
        benchmark=row["Harness"],
        metric=f"{metric_name},{suffix}",
        col_name=row["Name"],
        # Enum lookup is by member name, so the CSV value is upper-cased first.
        language=Language[row["Language"].upper()],
        domain=Domain[row["Domain"].upper()],
    )


# Maps each "Harness" value to its Task; a later row with the same "Harness"
# value replaces an earlier one.
tasks_dict = {row["Harness"]: _task_from_row(row) for _, row in tasks_df.iterrows()}

# Enum with one member per task: the member name is the task's "Harness" key
# and the member value is the corresponding Task object.
Tasks = Enum("Tasks", tasks_dict)