BryanW committed on
Commit
f12e61a
·
verified ·
1 Parent(s): a60ff7d

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/__init__.py +0 -0
  2. Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/filter.py +56 -0
  3. Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/group.py +115 -0
  4. Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/instance.py +38 -0
  5. Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/metrics.py +578 -0
  6. Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/model.py +493 -0
  7. Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/registry.py +196 -0
  8. Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/samplers.py +232 -0
  9. Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/task.py +1881 -0
  10. Prism/LLaDA/LLaDA_Baseline/dllm_eval/caching/__init__.py +0 -0
  11. Prism/LLaDA/LLaDA_Baseline/dllm_eval/caching/cache.py +59 -0
  12. Prism/LLaDA/LLaDA_Baseline/dllm_eval/decontamination/__init__.py +0 -0
  13. Prism/LLaDA/LLaDA_Baseline/dllm_eval/decontamination/janitor.py +328 -0
  14. Prism/LLaDA/LLaDA_Baseline/dllm_eval/loggers/__init__.py +2 -0
  15. Prism/LLaDA/LLaDA_Baseline/dllm_eval/loggers/evaluation_tracker.py +530 -0
  16. Prism/LLaDA/LLaDA_Baseline/dllm_eval/loggers/utils.py +149 -0
  17. Prism/LLaDA/LLaDA_Baseline/dllm_eval/loggers/wandb_logger.py +358 -0
  18. Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/LLaDA.py +786 -0
  19. Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/__init__.py +19 -0
  20. Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/dummy.py +41 -0
  21. Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/hts_sampler.py +315 -0
  22. Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/huggingface.py +1489 -0
  23. Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/utils.py +854 -0
  24. Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/verifier.py +154 -0
  25. Prism/LLaDA/LLaDA_Baseline/dllm_eval/prompts/__init__.py +128 -0
  26. Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/__init__.py +670 -0
  27. Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/gsm8k/gsm8k.yaml +15 -0
  28. Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/gsm8k/utils.py +13 -0
  29. Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/humaneval/humaneval.yaml +13 -0
  30. Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/humaneval/utils.py +43 -0
  31. Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/mbpp/mbpp.yaml +14 -0
  32. Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/mbpp/utils.py +79 -0
  33. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__main__.py +12 -0
  34. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/INSTALLER +1 -0
  35. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/LICENSE +201 -0
  36. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/METADATA +477 -0
  37. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/RECORD +12 -0
  38. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/WHEEL +8 -0
  39. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/top_level.txt +1 -0
  40. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/INSTALLER +1 -0
  41. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/LICENSE.txt +22 -0
  42. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/METADATA +193 -0
  43. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/RECORD +52 -0
  44. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/WHEEL +5 -0
  45. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/top_level.txt +1 -0
  46. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/INSTALLER +1 -0
  47. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/LICENSE +13 -0
  48. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/METADATA +140 -0
  49. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/RECORD +19 -0
  50. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/WHEEL +6 -0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/filter.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Callable, Iterable, List, Union
4
+
5
+ from dllm_eval.api.instance import Instance
6
+
7
+
8
class Filter(ABC):
    """Base class for per-task post-processing of model responses.

    A Filter sees the raw model outputs (`instance.resps`) for every
    instance of a task at once and may transform them. Any number of
    filters or filter pipelines can be configured for a single run.
    """

    def __init__(self, **kwargs) -> None:
        """Subclasses may accept configuration kwargs and keep per-filter state."""
        pass

    @abstractmethod
    def apply(self, resps: Union[List, Iterable], docs: List[dict]) -> Iterable:
        """Transform the per-instance response lists.

        `resps` is ordered like the task's instances; implementations must
        return the (filtered) response lists in that exact same order, e.g.
        input [<resps for instance 0>, <resps for instance 1>] yields
        [<filtered resps for instance 0>, <filtered resps for instance 1>].
        """
        return resps
31
+
32
+
33
@dataclass
class FilterEnsemble:
    """
    FilterEnsemble creates a pipeline applying multiple filters.
    Its intended usage is to stack multiple post-processing steps in order.
    `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
    pipeline separately.
    """

    name: str
    filters: List[Callable[[], Filter]]

    def apply(self, instances: List[Instance]) -> None:
        """Run every filter in sequence and store results under `self.name`.

        Fix: the original `resps, docs = zip(*(...))` raised ValueError when
        `instances` was empty; an empty task is now a harmless no-op.
        """
        # gather per-instance responses and docs, preserving instance order
        resps = [inst.resps for inst in instances]
        docs = [inst.doc for inst in instances]

        for f in self.filters:
            # apply filters in sequence; each entry is a factory producing a fresh Filter
            resps = f().apply(resps, docs)

        # add the end results after filtering to filtered_resps of their respective
        # source instances, keyed by `self.name`: each FilterEnsemble applied in a
        # given run should use a different name.
        for inst, resp in zip(instances, resps):
            inst.filtered_resps[self.name] = resp
Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/group.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ from dataclasses import asdict, dataclass
3
+ from inspect import getsource
4
+ from typing import Any, Callable, List, Optional, Union
5
+
6
+
7
@dataclass
class AggMetricConfig(dict):
    """Configuration for aggregating one metric across a group's subtasks."""

    metric: Optional[str] = None
    aggregation: Optional[str] = "mean"
    # whether subtask scores are weighted by subtask size when aggregating
    # (fix: annotation was Optional[str] although the default/usage is boolean)
    weight_by_size: Optional[bool] = False
    # list of filter names which should be incorporated into the aggregated metric.
    filter_list: Optional[Union[str, list]] = "none"

    def __post_init__(self):
        # only "mean" (or a user-supplied callable) is supported across subtasks
        if self.aggregation != "mean" and not callable(self.aggregation):
            raise ValueError(
                f"Currently, 'mean' is the only pre-defined aggregation across groups' subtasks. Got '{self.aggregation}'."
            )

        # normalize a bare filter name to a single-element list
        if isinstance(self.filter_list, str):
            self.filter_list = [self.filter_list]


@dataclass
class GroupConfig(dict):
    """Declarative configuration of a task group and its metric aggregation."""

    group: Optional[str] = None
    group_alias: Optional[str] = None
    task: Optional[Union[str, list]] = None
    aggregate_metric_list: Optional[
        Union[List[AggMetricConfig], AggMetricConfig, dict]
    ] = None
    metadata: Optional[dict] = (
        None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
    )

    def __getitem__(self, item):
        # allow dict-style access to dataclass fields
        return getattr(self, item)

    def __setitem__(self, item, value):
        return setattr(self, item, value)

    def __post_init__(self):
        # coerce aggregate_metric_list into a list of AggMetricConfig objects
        if self.aggregate_metric_list is not None:
            if isinstance(self.aggregate_metric_list, dict):
                self.aggregate_metric_list = [self.aggregate_metric_list]

            self.aggregate_metric_list = [
                AggMetricConfig(**item) if isinstance(item, dict) else item
                for item in self.aggregate_metric_list
            ]

    def to_dict(self, keep_callable: bool = False) -> dict:
        """dumps the current config as a dictionary object, as a printable format.
        null fields will not be printed.
        Used for dumping results alongside full task configuration

        :return: dict
            A printable dictionary version of the TaskConfig object.

        # TODO: should any default value in the TaskConfig not be printed?
        """
        cfg_dict = asdict(self)
        # remove values that are `None` (fix: the loop previously only
        # serialized callables and never dropped nulls, contradicting the
        # docstring's "null fields will not be printed")
        for k, v in list(cfg_dict.items()):
            if v is None:
                del cfg_dict[k]
            elif callable(v):
                cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
        return cfg_dict

    def serialize_function(
        self, value: Union[Callable, str], keep_callable=False
    ) -> Union[Callable, str]:
        """Serializes a given function or string.

        If 'keep_callable' is True, the original callable is returned.
        Otherwise, attempts to return the source code of the callable using 'getsource'.
        """
        if keep_callable:
            return value
        else:
            try:
                return getsource(value)
            except (TypeError, OSError):
                # built-ins / dynamically created callables have no source
                return str(value)
    
85
+
86
+
87
class ConfigurableGroup(abc.ABC):
    """Thin wrapper exposing a GroupConfig's fields as read-only properties."""

    def __init__(
        self,
        config: Optional[dict] = None,
    ) -> None:
        # config is expanded into a GroupConfig; a dict is expected here
        self._config = GroupConfig(**config)

    @property
    def group(self):
        return self._config.group

    @property
    def group_alias(self):
        return self._config.group_alias

    @property
    def version(self):
        # Fix: GroupConfig declares no `version` field, so the original
        # attribute access always raised AttributeError; report None instead.
        return getattr(self._config, "version", None)

    @property
    def config(self):
        # printable dict form (see GroupConfig.to_dict)
        return self._config.to_dict()

    @property
    def group_name(self) -> Any:
        return self._config.group

    def __repr__(self):
        return f"ConfigurableGroup(group={self.group},group_alias={self.group_alias})"
Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/instance.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from typing import Literal, Optional, Tuple
3
+
4
+
5
# The four request types a task can issue to a model.
OutputType = Literal[
    "loglikelihood", "loglikelihood_rolling", "generate_until", "multiple_choice"
]


@dataclass
class Instance:
    """One evaluation request: a document plus the arguments sent to the model."""

    request_type: OutputType
    doc: dict
    arguments: tuple
    idx: int
    # packed (task_name, doc_id, repeats); unpacked in __post_init__
    metadata: Tuple[Optional[str], Optional[int], Optional[int]] = field(
        default_factory=lambda: (None, None, None)
    )
    resps: list = field(default_factory=list)
    filtered_resps: dict = field(default_factory=dict)

    # initialized after init (mirrors of the metadata tuple)
    task_name: Optional[str] = None
    doc_id: Optional[int] = None
    repeats: Optional[int] = None

    def __post_init__(self) -> None:
        # unpack metadata field into the named attributes
        self.task_name, self.doc_id, self.repeats = self.metadata

    @property
    def args(self):
        """
        Returns (string,) where `string` is the string to calculate loglikelihood over
        """
        if isinstance(self.arguments, tuple):
            return self.arguments
        return (self.arguments,)
Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/metrics.py ADDED
@@ -0,0 +1,578 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+ import random
4
+ import re
5
+ import string
6
+ from collections.abc import Iterable
7
+ from typing import List
8
+
9
+ import numpy as np
10
+ import sacrebleu
11
+
12
+ from dllm_eval.api.registry import register_aggregation, register_metric
13
+
14
+
15
+ eval_logger = logging.getLogger(__name__)
16
+
17
+
18
+ # Register Aggregations First
19
# Register Aggregations First
@register_aggregation("bypass")
def bypass_agg(arr):
    # Sentinel aggregation: always 999, marking "bypass" metrics as
    # not-actually-aggregated in results tables.
    return 999


@register_aggregation("nanmean")
def nanmean(arr):
    # Mean that ignores NaN entries; NaN for empty or all-NaN input
    # (guard avoids numpy's "mean of empty slice" warning).
    if len(arr) == 0 or all(np.isnan(arr)):
        return np.nan
    return np.nanmean(arr)


@register_aggregation("mean")
def mean(arr):
    # Plain arithmetic mean; raises ZeroDivisionError on empty input.
    return sum(arr) / len(arr)
34
+
35
+
36
@register_aggregation("median")
def median(arr):
    """Return the (upper) median of *arr*.

    Fix: the original indexed the unsorted list (`arr[len(arr) // 2]`),
    which is just the middle element in arrival order, not the median.
    Sort first so the result is order-independent.
    """
    return sorted(arr)[len(arr) // 2]
39
+
40
+
41
# Certain metrics must be calculated across all documents in a benchmark.
# We use them as aggregation metrics, paired with no-op passthrough metric fns.
@register_aggregation("perplexity")
def perplexity(items):
    # items: log-likelihoods; perplexity = exp(-average log-likelihood)
    return math.exp(-mean(items))


@register_aggregation("weighted_perplexity")
def weighted_perplexity(items):
    # items: (log-likelihood, weight) pairs, see weighted_mean below
    return math.exp(-weighted_mean(items))


@register_aggregation("bits_per_byte")
def bits_per_byte(items):
    # converts the weighted average negative log-likelihood (nats) to bits
    return -weighted_mean(items) / math.log(2)
56
+
57
+
58
@register_aggregation("f1")
def f1_score(items):
    # items: (gold, pred) pairs; binary F1 via scikit-learn.
    from sklearn.metrics import f1_score

    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    fscore = f1_score(golds, preds)

    # f1_score returns a scalar for binary labels, so np.max is a no-op here
    return np.max(fscore)


@register_aggregation("matthews_corrcoef")
def matthews_corrcoef(items):
    # items: (gold, pred) pairs; Matthews correlation coefficient via scikit-learn.
    from sklearn.metrics import matthews_corrcoef

    unzipped_list = list(zip(*items))
    golds = unzipped_list[0]
    preds = unzipped_list[1]
    return matthews_corrcoef(golds, preds)
78
+
79
+
80
@register_aggregation("bleu")
def bleu(items):
    """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
    for evaluating a generated sentence to a reference sentence. It counts matching
    n-grams in the candidate translation to n-grams in the reference text, where
    1-gram or unigram would be each token and a bigram comparison would be each
    word pair. The comparison is made regardless of word order
    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
    Paper: https://www.aclweb.org/anthology/P02-1040/

    Higher is better
    """
    # items: (reference, prediction) pairs accumulated across the whole task
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_bleu(preds, refs).score


@register_aggregation("chrf")
def chrf(items):
    """chrF++ is a tool for automatic evaluation of machine translation output
    based on character n-gram precision and recall enhanced with word n-grams.
    Source: https://github.com/m-popovic/chrF
    Paper: https://www.aclweb.org/anthology/W15-3049.pdf

    Higher is better # TODO I think
    """
    # items: (reference, prediction) pairs accumulated across the whole task
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_chrf(preds, refs).score


@register_aggregation("ter")
def ter(items):
    """Translation Error Rate is an error metric for machine translation that
    measures the number of edits required to change a system output into one
    of the references
    Source: http://www.cs.umd.edu/~snover/tercom/
    Paper: http://mt-archive.info/AMTA-2006-Snover.pdf

    Lower is better
    """
    # items: (reference, prediction) pairs accumulated across the whole task
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    refs, preds = _sacreformat(refs, preds)
    return sacrebleu.corpus_ter(preds, refs).score
127
+
128
+
129
@register_aggregation("brier_score")
def brier_score(items):  # This is a passthrough function
    """Mean Brier score over (gold_label, class_probabilities) pairs."""
    golds, probs = zip(*items)
    probs = np.asarray(probs)
    num_class = probs.shape[1]

    # one-hot encode the gold labels against the number of classes
    one_hot = np.eye(num_class)[list(golds)]
    return np.mean(np.sum((probs - one_hot) ** 2, axis=1))
137
+
138
+
139
# Passthrough metric fns: per-instance values are collected as-is and the
# paired aggregation (see above) produces the final score.
@register_metric(
    metric="brier_score",
    higher_is_better=False,
    output_type=["multiple_choice"],
    aggregation="brier_score",
)
def brier_score_fn(items):  # This is a passthrough function
    return items


@register_metric(
    metric="acc",
    higher_is_better=True,
    output_type=["loglikelihood", "multiple_choice"],
    aggregation="mean",
)
def acc_fn(items):  # This is a passthrough function
    return items


@register_metric(
    metric="acc_norm",
    higher_is_better=True,
    output_type=["loglikelihood", "multiple_choice"],
    aggregation="mean",
)
def acc_norm_fn(items):  # This is a passthrough function
    return items


@register_metric(
    metric="acc_mutual_info",
    higher_is_better=True,
    output_type="multiple_choice",
    aggregation="mean",
)
def acc_mutual_info_fn(items):  # This is a passthrough function
    return items
177
+
178
+
179
+ ### the code used in the `exact_match_hf_evaluate` function is ported from
180
+ ### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
181
+ ### which is under the apache license.
182
+
183
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
184
+
185
+ # Licensed under the Apache License, Version 2.0 (the "License");
186
+ # you may not use this file except in compliance with the License.
187
+ # You may obtain a copy of the License at
188
+
189
+ # http://www.apache.org/licenses/LICENSE-2.0
190
+
191
+
192
+ # Unless required by applicable law or agreed to in writing, software
193
+ # distributed under the License is distributed on an "AS IS" BASIS,
194
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
195
+ # See the License for the specific language governing permissions and
196
+ # limitations under the License.
197
def exact_match_hf_evaluate(
    predictions,
    references,
    regexes_to_ignore=None,
    ignore_case=False,
    ignore_punctuation=False,
    ignore_numbers=False,
):
    """Fraction of predictions that exactly match their reference.

    Ported from HuggingFace `evaluate`'s exact_match metric (Apache-2.0).
    Optional normalization steps strip regex matches, case, punctuation,
    and/or digits from both sides before comparing.
    """
    if regexes_to_ignore is not None:
        # strip every ignored pattern from both sides
        for pattern in regexes_to_ignore:
            predictions = np.array([re.sub(pattern, "", p) for p in predictions])
            references = np.array([re.sub(pattern, "", r) for r in references])
    else:
        predictions = np.asarray(predictions)
        references = np.asarray(references)

    if ignore_case:
        predictions = np.char.lower(predictions)
        references = np.char.lower(references)

    if ignore_punctuation:
        table = str.maketrans("", "", string.punctuation)
        predictions = np.char.translate(predictions, table=table)
        references = np.char.translate(references, table=table)

    if ignore_numbers:
        table = str.maketrans("", "", string.digits)
        predictions = np.char.translate(predictions, table=table)
        references = np.char.translate(references, table=table)

    # elementwise equality -> mean gives the exact-match rate
    return {"exact_match": np.mean(predictions == references)}
230
+
231
+
232
+ ###
233
+
234
+
235
@register_metric(
    metric="exact_match",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="mean",
)
def exact_match_fn(**kwargs):
    # Thin registry wrapper around the HF-evaluate-ported implementation above.
    return exact_match_hf_evaluate(**kwargs)


# The following are passthrough fns: the paired aggregation does the real work.
@register_metric(
    metric="perplexity",
    higher_is_better=False,
    output_type="loglikelihood",
    aggregation="perplexity",
)
def perplexity_fn(items):  # This is a passthrough function
    return items


@register_metric(
    metric="word_perplexity",
    higher_is_better=False,
    output_type="loglikelihood_rolling",
    aggregation="weighted_perplexity",
)
def word_perplexity_fn(items):  # This is a passthrough function
    return items


@register_metric(
    metric="byte_perplexity",
    higher_is_better=False,
    output_type="loglikelihood_rolling",
    aggregation="weighted_perplexity",
)
def byte_perplexity_fn(items):  # This is a passthrough function
    return items


@register_metric(
    metric="bits_per_byte",
    higher_is_better=False,
    output_type="loglikelihood_rolling",
    aggregation="bits_per_byte",
)
def bits_per_byte_fn(items):  # This is a passthrough function
    return items
283
+
284
+
285
def pop_stddev(arr):
    """Population standard deviation (divides by N)."""
    mu = sum(arr) / len(arr)
    return math.sqrt(sum((x - mu) ** 2 for x in arr) / len(arr))


def sample_stddev(arr):
    """Sample standard deviation with Bessel's correction (divides by N - 1)."""
    mu = sum(arr) / len(arr)
    return math.sqrt(sum((x - mu) ** 2 for x in arr) / (len(arr) - 1))


def mean_stderr(arr):
    """Standard error of the mean: s / sqrt(N)."""
    return sample_stddev(arr) / math.sqrt(len(arr))
297
+
298
+
299
@register_metric(
    metric="bypass",
    higher_is_better=True,
    output_type=["loglikelihood", "multiple_choice", "generate_until"],
    aggregation="bypass",
)
def bypass(items):
    # Deliberately discards per-instance values; paired "bypass" aggregation
    # returns a sentinel so results are recognizable as unscored.
    return None


# Passthrough fns below: the paired aggregation does the real scoring.
@register_metric(
    metric="mcc",
    higher_is_better=True,
    output_type="multiple_choice",
    aggregation="matthews_corrcoef",
)
def mcc_fn(items):  # This is a passthrough function
    return items


@register_metric(
    metric="f1",
    higher_is_better=True,
    output_type="multiple_choice",
    aggregation="f1",
)
def f1_fn(items):  # This is a passthrough function
    return items


@register_metric(
    metric="bleu",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="bleu",
)
def bleu_fn(items):  # This is a passthrough function
    return items


@register_metric(
    metric="chrf",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="chrf",
)
def chrf_fn(items):  # This is a passthrough function
    return items


# NOTE(review): TER is an error rate (the `ter` aggregation's docstring says
# "Lower is better"), yet it is registered with higher_is_better=True here —
# confirm whether this direction flag is intentional.
@register_metric(
    metric="ter",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="ter",
)
def ter_fn(items):  # This is a passthrough function
    return items
357
+
358
+
359
@register_metric(
    metric="acc_all",
    higher_is_better=True,
    output_type="loglikelihood",
    aggregation="mean",
)
def acc_all(items):
    """Accuracy that counts a question correct only if ALL of its answers are.

    items: (pred, doc) pairs; docs carry nested (paragraph, question) indices
    and a 0/1 `label` per candidate answer.
    """
    preds, docs = zip(*items)

    # group per-answer correctness flags by (paragraph, question)
    correct_by_question = {}
    for doc, pred in zip(docs, preds):
        key = (doc["idx"]["paragraph"], doc["idx"]["question"])
        gold_label = doc["label"] == 1
        correct_by_question.setdefault(key, []).append(gold_label == pred)

    return np.mean([int(all(flags)) for flags in correct_by_question.values()])
382
+
383
+
384
def acc_all_stderr(items):
    """Stderr companion to `acc_all`; groups answers by question id only."""
    preds, docs = zip(*items)

    # group per-answer correctness flags by question id
    correct_by_question = {}
    for doc, pred in zip(docs, preds):
        qid = doc["idx"]["question"]
        gold_label = doc["label"] == 1
        correct_by_question.setdefault(qid, []).append(gold_label == pred)

    return mean_stderr([int(all(flags)) for flags in correct_by_question.values()])
400
+
401
+
402
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Compute max metric between prediction and each ground truth."""
    return max(metric_fn(prediction, gt) for gt in ground_truths)
409
+
410
+
411
def weighted_mean(items):
    """Ratio of sums over (numerator, denominator) pairs."""
    numerators, denominators = zip(*items)
    return sum(numerators) / sum(denominators)
414
+
415
+
416
def is_non_str_iterable(obj):
    """Return True if *obj* is iterable but not a plain string."""
    if isinstance(obj, str):
        return False
    return isinstance(obj, Iterable)
418
+
419
+
420
+ def _sacreformat(refs, preds):
421
+ """Format refs and preds for sacrebleu corpus calculation. It is very particular"""
422
+ # Sacrebleu expects (List[str], List[List[str])
423
+ # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
424
+
425
+ # Note [ref1_stream] is the first reference for each pred.
426
+ # So lists are size N and (M, N) for N preds and M possible refs for each pred
427
+ # This is a different order of dimensions that I would expect
428
+
429
+ # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
430
+ # Must become List[List[str]] with the inner list corresponding to preds
431
+ if not is_non_str_iterable(refs):
432
+ refs = list(refs)
433
+ if not is_non_str_iterable(refs[0]):
434
+ refs = [[ref] for ref in refs]
435
+ refs = list(zip(*refs))
436
+ # Note the number of refs in each ref list much match the number of preds
437
+
438
+ # We expect preds to be List[str] or List[List[str]]. Must become List[str]
439
+ if not is_non_str_iterable(preds):
440
+ preds = list(preds)
441
+ if is_non_str_iterable(preds[0]):
442
+ assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
443
+ preds = [pred[0] for pred in preds]
444
+
445
+ return refs, preds
446
+
447
+
448
+ # stderr stuff
449
+
450
+
451
+ class _bootstrap_internal:
452
+ def __init__(self, f, n) -> None:
453
+ self.f = f
454
+ self.n = n
455
+
456
+ def __call__(self, v):
457
+ i, xs = v
458
+ rnd = random.Random()
459
+ rnd.seed(i)
460
+ res = []
461
+ for _ in range(self.n):
462
+ res.append(self.f(rnd.choices(xs, k=len(xs))))
463
+ return res
464
+
465
+
466
def bootstrap_stderr(f, xs, iters):
    """Estimate the standard error of statistic `f` over sample `xs` by bootstrap.

    Fans out ~`iters` resamples across a process pool in chunks (each chunk
    deterministically seeded by its index), then returns the sample stddev
    of the bootstrap distribution. Side effects: spawns a pool sized to
    cpu_count, prints progress via tqdm.
    """
    import multiprocessing as mp

    pool = mp.Pool(mp.cpu_count())
    # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
    # equivalent to stderr calculated without Bessel's correction in the stddev.
    # Unfortunately, I haven't been able to figure out what the right correction is
    # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
    # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
    # Thankfully, shouldn't matter because our samples are pretty big usually anyways
    res = []
    chunk_size = min(1000, iters)
    from tqdm import tqdm

    print("bootstrapping for stddev:", f.__name__)
    # NOTE: integer division means at most (iters // chunk_size) * chunk_size
    # resamples are drawn; iters below 1000 that don't divide evenly are truncated.
    for bootstrap in tqdm(
        pool.imap(
            _bootstrap_internal(f, chunk_size),
            [(i, xs) for i in range(iters // chunk_size)],
        ),
        total=iters // chunk_size,
    ):
        # sample w replacement
        res.extend(bootstrap)

    pool.close()
    return sample_stddev(res)
493
+
494
+
495
def stderr_for_metric(metric, bootstrap_iters: int):
    """Return a stderr function for `metric`, or None if none applies.

    Matching is by function *identity* against the aggregations defined in
    this module, so only those module-level functions get a stderr.
    """
    if bootstrap_iters <= 0:
        # return no function (don't compute stderr) if bootstrap iters = 0
        return None

    # aggregations whose stderr we estimate by bootstrap resampling
    bootstrappable = [
        median,
        matthews_corrcoef,
        f1_score,
        perplexity,
        bleu,
        chrf,
        ter,
        nanmean,
    ]

    if metric in bootstrappable:
        return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)

    # closed-form stderrs for the remaining known aggregations
    stderr = {mean: mean_stderr, acc_all: acc_all_stderr}

    return stderr.get(metric, None)
517
+
518
+
519
def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
    """Aggregate per-subtask stderrs into one stderr, weighting by subtask size.

    Used for bootstrapped stderrs across subtasks in a group. Formula source:
    https://en.wikipedia.org/wiki/Pooled_variance (see also
    https://stats.stackexchange.com/a/4841331); empirically matches running
    `stderr_for_metric` on all subtask instances concatenated together.
    """
    assert len(stderrs) == len(sizes)

    total = sum(sizes)
    pooled_sample_var = sum(
        (size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)
    ) / (total - len(sizes))

    return np.sqrt(pooled_sample_var / total)
535
+
536
+
537
def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
    """Pairwise-combined stderr across subtasks; needs each subtask's mean score.

    Kept for reference only — see the inline warnings; prefer
    `pooled_sample_stderr` for group aggregation.
    """
    assert metrics is not None, (
        "Need to pass a list of each subtask's metric for this stderr aggregation"
    )
    assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)

    # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
    # This formula depends on sample means.
    # removed because it seems to give erroneously huge stderrs for groupings of tasks
    # and does not seem to match up with bootstrap-calculated stderrs for groups.

    ### don't use this unless a statistician has told you it's the right thing to do ###

    # accumulators: we'll aggregate pairwise N - 1 times
    variance = stderrs[0] ** 2
    curr_size = sizes[0]
    curr_score = metrics[0]

    # NOTE(review): `curr_size` is never updated inside the loop, so for more
    # than two subtasks every merge still uses the first subtask's size —
    # confirm whether that is intentional.
    for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]):
        curr_score = ((curr_score * curr_size) + (score * size)) / (
            curr_size + size
        )  # NOTE: this assumes our aggregation fn is "mean"

        variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / (
            curr_size + size - 1
        ) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * (
            curr_score - score
        ) ** 2

    return np.sqrt(variance)
567
+
568
+
569
def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
    """Weighted (or unweighted) mean of subtask scores for group aggregation.

    TODO: does not hold for non-mean aggregations.
    """
    # unweighted mean == every subtask weighted equally
    weights = sizes if weight_by_size else [1] * len(sizes)

    assert len(metrics) == len(weights)

    return sum(m * w for m, w in zip(metrics, weights)) / sum(weights)
Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/model.py ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import hashlib
3
+ import json
4
+ import logging
5
+ import os
6
+ from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
7
+
8
+ import transformers
9
+ from sqlitedict import SqliteDict
10
+ from tqdm import tqdm
11
+
12
+ from dllm_eval import utils
13
+
14
+
15
+ eval_logger = logging.getLogger(__name__)
16
+
17
+ T = TypeVar("T", bound="LM")
18
+
19
+
20
class LM(abc.ABC):
    """Base interface that every language-model backend must implement.

    LMs take text (strings) as input and yield strings as output, so the
    interface is tokenization-agnostic.
    """

    def __init__(self) -> None:
        # Single-process defaults; distributed subclasses overwrite these.
        self._rank = 0
        self._world_size = 1
        self.cache_hook = CacheHook(None)

    @abc.abstractmethod
    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
        """Compute the log-likelihood of a continuation given a context.

        Downstream tasks should prefer this over other LM calls when possible.

        :param requests: list[Instance]
            Each instance's ``args`` is ``(context, continuation)``. ``context``
            may be the empty string; any word-boundary space belongs to the
            continuation (e.g. context="hello", continuation=" world").
        :return: list[tuple[float, bool]]
            Per request: ``logprob``, the log probability of the continuation,
            and ``isgreedy``, whether greedy sampling from ``context`` would
            generate the continuation.
        """
        pass

    @abc.abstractmethod
    def loglikelihood_rolling(self, requests) -> List[float]:
        """Compute the full, untruncated log-likelihood of a string (for perplexity).

        - Uses the model's full max context length; longer inputs are split into
          chunks of at most that length.
        - Each document is scored *separately* (never concatenated with others),
          and every token is predicted exactly once.
        - Context is maximized: the final chunk still receives a full-sized
          context even though only its trailing tokens are scored.

        Example (max context 4, prefix BOS/EOS, input tokens [0..9]):
            INPUT: BOS 0 1 2   PRED: 0 1 2 3
            INPUT:   3 4 5 6   PRED: 4 5 6 7
            INPUT:   5 6 7 8   PRED: 8 9

        :param requests: list[Instance]
            Each instance's ``args`` is ``(string,)`` — the text to score.
        :return: list[tuple[float]]
            Per request: the log probability of the string conditioned on the
            BOS/EOS token (overridable via ``prefix_token_id``).
        """
        pass

    # TODO: Add an optional max length
    @abc.abstractmethod
    def generate_until(self, requests) -> List[str]:
        """Generate greedily from each context until a stopping sequence is hit.

        :param requests: list[Instance]
            Each instance's ``args`` is ``(context, gen_kwargs)``, where
            ``gen_kwargs`` is a dict of generation options (top_k, until, ...).
        :return: list[str]
            One generated continuation per request.
        """
        pass

    def apply_chat_template(
        self, chat_history: List[Dict[str, str]], add_generation_prompt=True
    ) -> str:
        """Render a chat history into a prompt string usable as LM input.

        :param chat_history: list[dict[str, str]]
            Messages as dicts with 'role' and 'content' string values.
        :param add_generation_prompt: bool
            Whether to append the assistant generation prefix (e.g.
            <|assistant|>); pass False when prefilling an assistant message.
        :return: str
            The rendered prompt.
        """
        raise NotImplementedError(
            "To use this model with chat templates, please implement the 'apply_chat_template' method for your model type."
        )

    @classmethod
    def create_from_arg_string(
        cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
    ) -> T:
        """Build an instance from a ``key1=value1,key2=value2`` argument string,
        merged with the non-None entries of ``additional_config``.

        :return: an instance of the LM subclass.
        """
        parsed = utils.simple_parse_args_string(arg_string)
        overrides = {
            k: v for k, v in (additional_config or {}).items() if v is not None
        }
        return cls(**parsed, **overrides)

    @classmethod
    def create_from_arg_obj(
        cls: Type[T], arg_dict: dict, additional_config: Optional[dict] = None
    ) -> T:
        """Build an instance from an argument dict, merged with the non-None
        entries of ``additional_config``.

        :return: an instance of the LM subclass.
        """
        overrides = {
            k: v for k, v in (additional_config or {}).items() if v is not None
        }
        return cls(**arg_dict, **overrides)

    @property
    def rank(self):
        # Hardcoded for API-style models which neither support nor expect
        # multi-device parallelism.
        return self._rank

    @property
    def world_size(self):
        # Hardcoded for API-style models which neither support nor expect
        # multi-device parallelism.
        return self._world_size

    @property
    def tokenizer_name(self) -> str:
        """Name of the tokenizer / chat template in use.

        Must be defined by subclasses implementing chat templating; used only
        to fingerprint caches under ``--cache_requests``.
        """
        raise NotImplementedError(
            "To use this model with chat templates, please implement the 'tokenizer_name' property."
        )

    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
        """Return the chat template structure for user/assistant messages.

        Intended to be overridden by subclasses that define a real template;
        this default (for models without chat-template support) returns the
        empty string.
        """
        return ""

    def set_cache_hook(self, cache_hook) -> None:
        self.cache_hook = cache_hook
209
+
210
+
211
### SQLite-based caching of LM responses
def hash_args(attr, args):
    """Deterministic cache key: SHA-256 hex digest of the JSON-encoded
    ``[attr, *args]`` list (``attr`` is the LM method name)."""
    payload = json.dumps([attr, *args])
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
215
+
216
+
217
class CacheHook:
    """Write-through hook letting an LM record fresh results into a CachingLM store."""

    def __init__(self, cachinglm) -> None:
        # A hook built with no CachingLM is a no-op sink.
        self.dbdict = None if cachinglm is None else cachinglm.dbdict

    def add_partial(self, attr, req, res) -> None:
        """Store one (request -> result) pair under the LM method name ``attr``."""
        if self.dbdict is None:
            return
        self.dbdict[hash_args(attr, req)] = res
230
+
231
+
232
class CachingLM:
    """Transparent SQLite-backed cache around an LM's request-level methods."""

    def __init__(self, lm, cache_db) -> None:
        """LM wrapper that returns cached results if they exist, and uses the underlying LM if not.

        :param lm: LM
            Underlying LM
        :param cache_db: str
            Path to cache db
        """
        self.lm = lm
        self.cache_db = cache_db
        if os.path.dirname(cache_db):
            os.makedirs(os.path.dirname(cache_db), exist_ok=True)
        self.dbdict = SqliteDict(cache_db, autocommit=True)

        # add hook to lm so the wrapped model writes fresh results back into this cache
        lm.set_cache_hook(self.get_cache_hook())

    def __getattr__(self, attr: str):
        # Intercept only the three request-level entry points; every other
        # attribute access is forwarded to the wrapped LM unchanged.
        lm_attr = getattr(self.lm, attr)
        if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]:
            eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM")
            return lm_attr

        def fn(requests):
            res = []
            remaining_reqs = []
            warned = False
            # figure out which ones are cached and which ones are new
            eval_logger.info(
                f"Loading '{attr}' responses from cache '{self.cache_db}' where possible..."
            )
            for req in tqdm(requests, desc="Checking cached requests"):
                hsh = hash_args(attr, req.args)
                if attr == "generate_until" and req.args[1].get("do_sample", False):
                    # when we are doing non-greedy generation, don't use the cache
                    # (else every "randomly sampled" generation would be identical for repeats > 1).
                    if not warned:
                        eval_logger.warning(
                            f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
                        )
                        warned = True
                    res.append(None)
                    remaining_reqs.append(req)
                elif hsh in self.dbdict:
                    ob = self.dbdict[hsh]

                    assert ob is not None

                    res.append(ob)
                else:
                    # placeholder; filled in below once the LM has answered
                    res.append(None)
                    remaining_reqs.append(req)
            eval_logger.info(
                f"Cached requests: {len(requests) - len(remaining_reqs)}, Requests remaining: {len(remaining_reqs)}"
            )
            if remaining_reqs:
                # actually run the LM on the requests that do not have cached results
                rem_res = getattr(self.lm, attr)(remaining_reqs)
            else:
                rem_res = []

            # stick the new ones back into the list and also cache any of the new ones
            resptr = 0
            for req, r in zip(remaining_reqs, rem_res):
                # advance to the next placeholder slot left by a cache miss
                while res[resptr] is not None:
                    resptr += 1

                res[resptr] = r

                # caching
                hsh = hash_args(attr, req.args)
                self.dbdict[hsh] = r
            self.dbdict.commit()

            return res

        return fn

    def get_cache_hook(self):
        """Return a CacheHook bound to this wrapper's backing store."""
        return CacheHook(self)
313
+
314
+
315
class TemplateLM(LM):
    """
    A class acting as intermediary between the LM base class
    and boilerplate often included in other LM subclasses.
    """

    # Subclasses supporting chat templating set this to their HF tokenizer.
    tokenizer = None

    @property
    @abc.abstractmethod
    def eot_token_id(self):
        # End-of-text token id for this model's vocabulary.
        pass

    @property
    def prefix_token_id(self):
        # it is used as prefix for loglikelihood
        return self.eot_token_id

    @abc.abstractmethod
    def tok_encode(self, string: str, **kwargs) -> List[int]:
        """
        Tokenize a string using the model's tokenizer and return a list of token IDs.
        """
        pass

    @abc.abstractmethod
    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
        # Scores pre-tokenized (context, continuation) requests; implemented by subclasses.
        pass

    def _encode_pair(
        self, context: str, continuation: str
    ) -> Tuple[List[int], List[int]]:
        # Move trailing whitespace of the context onto the continuation, so the
        # boundary tokenizes the same way the joined text would.
        n_spaces = len(context) - len(context.rstrip())
        if n_spaces > 0:
            continuation = context[-n_spaces:] + continuation
            context = context[:-n_spaces]

        model_class = getattr(self, "AUTO_MODEL_CLASS", None)

        if model_class == transformers.AutoModelForSeq2SeqLM:
            # Encoder-decoder models: encode the two pieces independently.
            context_enc = self.tok_encode(context)
            continuation_enc = self.tok_encode(continuation, add_special_tokens=False)
        else:
            # Decoder-only models: encode the joined text and split at the
            # context's token length, so merged boundary tokens fall on the
            # continuation side.
            whole_enc = self.tok_encode(context + continuation)
            context_enc = self.tok_encode(context)

            context_enc_len = len(context_enc)
            continuation_enc = whole_enc[context_enc_len:]

        return context_enc, continuation_enc

    def loglikelihood(
        self, requests, disable_tqdm: bool = False
    ) -> List[Tuple[float, bool]]:
        # Tokenize each (context, continuation) pair, then defer to the
        # subclass's token-level scorer.
        new_reqs = []
        for context, continuation in [req.args for req in requests]:
            if context == "":
                # BOS or EOS as context
                context_enc, continuation_enc = (
                    [self.prefix_token_id],
                    self.tok_encode(continuation),
                )
            else:
                context_enc, continuation_enc = self._encode_pair(context, continuation)

            new_reqs.append(((context, continuation), context_enc, continuation_enc))

        return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)

    @abc.abstractmethod
    def loglikelihood_rolling(
        self, requests, disable_tqdm: bool = False
    ) -> List[float]:
        pass

    @abc.abstractmethod
    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
        pass

    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
        """
        Set and get the appropriate chat template for the model.
        This method sets the tokenizer's chat_template and returns the template string for reproducibility.

        The template selection logic is adapted from the Transformers library's
        `apply_chat_template` method; original implementation:
        https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687

        Selection rules:
        0. No 'tokenizer' attribute: a single provider-side template is assumed; returns "".
        1. Tokenizer holds a dict of templates: use the named one if given, else
           the "default" entry; raise ValueError otherwise.
        2. Tokenizer holds a single template (or none): use
           `tokenizer.chat_template`, falling back to the legacy class-level
           `default_chat_template`.

        Args:
            chat_template (Union[bool, str]): False/None -> no template;
                True -> default/only template; str -> template of that name.

        Returns:
            Optional[str]: the selected template, or None if no template is applied.
        """
        if self.tokenizer is None:
            return ""

        if chat_template is False or chat_template is None:
            eval_logger.warning(
                "model.chat_template was called with the chat_template set to False or None. "
                "Therefore no chat template will be applied. Make sure this is an intended behavior."
            )
            return None

        # Convert boolean chat_template to None to ensure compatibility with the adapted logic
        if isinstance(chat_template, bool):
            chat_template = None
        using_default_template = False

        # First, handle the cases when the model has a dict of multiple templates
        try:
            template = (
                self.tokenizer.chat_template or self.tokenizer.default_chat_template
            )
        except AttributeError:
            # Tokenizer exposes neither attribute: no template available.
            return None

        if isinstance(template, dict):
            using_default_dict = self.tokenizer.chat_template is None

            if chat_template is not None:
                if chat_template in template:
                    selected_template = template[chat_template]
                    if using_default_dict:
                        using_default_template = True
                else:
                    raise ValueError(
                        f"The specified chat template '{chat_template}' is not available. "
                        f"Available template names are {sorted(template.keys())}."
                    )
            else:
                # If user didn't pass a chat template, use the default template from the dict
                if "default" in template:
                    selected_template = template["default"]
                    using_default_template = True
                else:
                    raise ValueError(
                        "This model has multiple chat templates with no default specified! Please either pass a chat "
                        "template or the name of the template you wish to use to the `chat_template` argument. Available "
                        f"template names are {sorted(template.keys())}."
                    )

        # Cases when the model has a single template or no template
        else:
            # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
            if isinstance(chat_template, str):
                eval_logger.warning(
                    "Chat template name provided, but the tokenizer's chat template is not a dictionary. "
                    "Using the tokenizer's chat template or the default template instead."
                )
            if self.tokenizer.chat_template is not None:
                selected_template = self.tokenizer.chat_template
            else:
                selected_template = self.tokenizer.default_chat_template
                using_default_template = True

        if using_default_template:
            eval_logger.warning(
                "No chat template is set for this tokenizer, falling back to a default class-level template. This is "
                "very error-prone, because models are often trained with templates different from the class default! "
                "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
                "point any code depending on them will stop working. We recommend setting a valid chat template before "
                "then to ensure that this model continues working without issues."
            )

        return selected_template
Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/registry.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Callable, Dict, Union
3
+
4
+ import evaluate as hf_evaluate
5
+
6
+ from dllm_eval.api.model import LM
7
+
8
+
9
+ eval_logger = logging.getLogger(__name__)
10
+
11
# Alias -> LM subclass mapping populated by @register_model.
MODEL_REGISTRY = {}


def register_model(*names):
    """Class decorator registering an LM subclass under one or more aliases."""

    def _bind(cls):
        for name in names:
            assert issubclass(cls, LM), (
                f"Model '{name}' ({cls.__name__}) must extend LM class"
            )

            assert name not in MODEL_REGISTRY, (
                f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
            )

            MODEL_REGISTRY[name] = cls
        return cls

    return _bind


def get_model(model_name):
    """Look up a registered model class by alias; raise ValueError if unknown."""
    if model_name in MODEL_REGISTRY:
        return MODEL_REGISTRY[model_name]
    raise ValueError(
        f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {', '.join(MODEL_REGISTRY.keys())}"
    )
41
+
42
+
43
# Task name -> constructor fn; group name -> member task names; plus reverse
# index from function __name__ to registered task alias.
TASK_REGISTRY = {}
GROUP_REGISTRY = {}
ALL_TASKS = set()
func2task_index = {}


def register_task(name):
    """Decorator: register a task-construction function under ``name``."""

    def _bind(fn):
        assert name not in TASK_REGISTRY, (
            f"task named '{name}' conflicts with existing registered task!"
        )

        TASK_REGISTRY[name] = fn
        ALL_TASKS.add(name)
        func2task_index[fn.__name__] = name
        return fn

    return _bind


def register_group(name):
    """Decorator: add an already-registered task to the group ``name``.

    Must be applied after ``register_task`` on the same function, since the
    task alias is looked up through ``func2task_index``.
    """

    def _bind(fn):
        task_name = func2task_index[fn.__name__]
        GROUP_REGISTRY.setdefault(name, []).append(task_name)
        ALL_TASKS.add(name)
        return fn

    return _bind
74
+
75
+
76
OUTPUT_TYPE_REGISTRY = {}
METRIC_REGISTRY = {}
METRIC_AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
HIGHER_IS_BETTER_REGISTRY = {}
FILTER_REGISTRY = {}

# Default metrics applied when a task config specifies none, keyed by the
# task's request/output type.
DEFAULT_METRIC_REGISTRY = {
    "loglikelihood": [
        "perplexity",
        "acc",
    ],
    "loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"],
    "multiple_choice": ["acc", "acc_norm"],
    "generate_until": ["exact_match"],
}


def register_metric(**args):
    """Decorator factory registering a metric function plus its metadata.

    Keyword arguments:
        metric: name to register the decorated function under (required).
        higher_is_better: optional bool, recorded in HIGHER_IS_BETTER_REGISTRY.
        aggregation: optional name of a registered aggregation; the resolved
            callable is recorded in METRIC_AGGREGATION_REGISTRY.
    """

    # TODO: do we want to enforce a certain interface to registered metrics?
    def decorate(fn):
        assert "metric" in args
        name = args["metric"]

        for key, registry in [
            ("metric", METRIC_REGISTRY),
            ("higher_is_better", HIGHER_IS_BETTER_REGISTRY),
            ("aggregation", METRIC_AGGREGATION_REGISTRY),
        ]:
            if key in args:
                value = args[key]
                # Fixed: the duplicate check must test the *metric name* against
                # the registry keys. The previous code tested `value`, which for
                # "higher_is_better"/"aggregation" compared a bool/aggregation
                # name against metric-name keys and could fire spuriously (e.g.
                # a metric sharing its name with an aggregation).
                assert name not in registry, (
                    f"{key} for metric '{name}' conflicts with an existing registered {key}!"
                )

                if key == "metric":
                    registry[name] = fn
                elif key == "aggregation":
                    registry[name] = AGGREGATION_REGISTRY[value]
                else:
                    registry[name] = value

        return fn

    return decorate
121
+
122
+
123
def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
    """Resolve a metric function by name.

    Checks the local METRIC_REGISTRY first (unless ``hf_evaluate_metric`` is
    True), then falls back to loading from the HF Evaluate library. Logs an
    error and returns None when neither source knows the name.
    """
    if not hf_evaluate_metric:
        if name in METRIC_REGISTRY:
            return METRIC_REGISTRY[name]
        eval_logger.warning(
            f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..."
        )

    try:
        return hf_evaluate.load(name).compute
    except Exception:
        eval_logger.error(
            f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
        )
139
+
140
+
141
def register_aggregation(name: str):
    """Decorator: register an aggregation function under ``name``."""

    def _bind(fn):
        assert name not in AGGREGATION_REGISTRY, (
            f"aggregation named '{name}' conflicts with existing registered aggregation!"
        )
        AGGREGATION_REGISTRY[name] = fn
        return fn

    return _bind
151
+
152
+
153
def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
    """Return the aggregation registered under ``name`` (None + warning if absent)."""
    if name in AGGREGATION_REGISTRY:
        return AGGREGATION_REGISTRY[name]
    eval_logger.warning(f"{name} not a registered aggregation metric!")
158
+
159
+
160
def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
    """Return the default aggregation assigned to metric ``name`` (None + warning if absent)."""
    if name in METRIC_AGGREGATION_REGISTRY:
        return METRIC_AGGREGATION_REGISTRY[name]
    eval_logger.warning(f"{name} metric is not assigned a default aggregation!")
165
+
166
+
167
def is_higher_better(metric_name) -> bool:
    """Whether larger values of ``metric_name`` are better (None + warning if unknown)."""
    if metric_name in HIGHER_IS_BETTER_REGISTRY:
        return HIGHER_IS_BETTER_REGISTRY[metric_name]
    eval_logger.warning(
        f"higher_is_better not specified for metric '{metric_name}'!"
    )
174
+
175
+
176
def register_filter(name):
    """Decorator: register a filter class under ``name``.

    Re-registration is allowed (logged at info level) and overwrites the
    previous entry.
    """

    def _bind(cls):
        if name in FILTER_REGISTRY:
            eval_logger.info(
                f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}"
            )
        FILTER_REGISTRY[name] = cls
        return cls

    return _bind
186
+
187
+
188
def get_filter(filter_name: Union[str, Callable]) -> Callable:
    """Resolve a filter by registered name; pass callables through unchanged.

    Raises the underlying KeyError (after logging) for unknown string names.
    """
    try:
        return FILTER_REGISTRY[filter_name]
    except KeyError:
        if not callable(filter_name):
            eval_logger.warning(f"filter `{filter_name}` is not registered!")
            raise
        return filter_name
Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/samplers.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import warnings
3
+ from functools import partial
4
+ from typing import TYPE_CHECKING, Iterable, Optional, Union
5
+
6
+ import datasets
7
+
8
+
9
+ if TYPE_CHECKING:
10
+ from random import Random
11
+
12
+ from dllm_eval.api.task import ConfigurableTask, Task
13
+
14
+ eval_logger = logging.getLogger("lm-eval")
15
+
16
+
17
class ContextSampler:
    """Draws few-shot example documents and formats them into a prompt context."""

    def __init__(
        self,
        docs: list[dict],
        task: Union["Task", "ConfigurableTask"],
        fewshot_indices: Optional[Iterable] = None,
        rnd: Optional["Random"] = None,
    ) -> None:
        self.rnd = rnd
        if not self.rnd:
            raise ValueError(
                "A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!"
            )

        self.task = task
        self.config = task._config

        self.target_delimiter = self.config.target_delimiter
        self.fewshot_delimiter = self.config.fewshot_delimiter

        # Fixed: the override-resolution logic was triplicated; each of the
        # task's doc_to_* callables may be overridden via fewshot_config.
        self.doc_to_text = self._resolve_override("doc_to_text")
        self.doc_to_target = self._resolve_override("doc_to_target")
        self.doc_to_choice = self._resolve_override("doc_to_choice")

        self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
        if fewshot_indices:  # subset few-shot docs from
            if not isinstance(self.docs, datasets.Dataset):
                raise ValueError(
                    "Got `fewshot_indices` but fewshot_docs are not a HF dataset. Don't use both `fewshot_indices` and a user-defined few-shot sample list simultaneously"
                )
            self.docs = self.docs.select(fewshot_indices)

    def _resolve_override(self, attr: str):
        """Return the task's ``attr`` callable, partially applied with the
        fewshot_config override of the same name when one is present."""
        base = getattr(self.task, attr)
        fewshot_cfg = self.config.fewshot_config
        override = fewshot_cfg.get(attr, None) if fewshot_cfg is not None else None
        if override is not None:
            return partial(base, **{attr: override})
        return base

    def get_context(self, doc: dict, num_fewshot: int, gen_prefix: Optional[str] = None):
        """Build the few-shot prompt string preceding ``doc``.

        :param doc: the document under evaluation (excluded from the shots).
        :param num_fewshot: number of examples to include.
        :param gen_prefix: optional response prefix inserted before each target.
        """
        prefix = gen_prefix + " " if gen_prefix else ""
        # draw an extra fewshot sample if using same split as evaluating on
        n_samples = (
            num_fewshot + 1
            if self.config.fewshot_split == self.config.test_split
            else num_fewshot
        )

        # draw `n_samples` docs from fewshot_docs
        fewshotex = self.sample(n_samples)

        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        labeled_examples = ""
        # Fixed: loop variable renamed so it no longer shadows the `doc` parameter.
        for shot in selected_docs:
            shot_content = self.doc_to_text(shot)
            shot_target = self.doc_to_target(shot)

            if self.config.doc_to_choice is None or isinstance(shot_content, str):
                labeled_examples += shot_content
            else:
                labeled_examples += self.doc_to_choice(shot)[shot_content]

            if shot_target != "":
                if self.target_delimiter.isspace() and str(shot_target)[0].isspace():
                    # TODO: add logger warn once here.
                    warnings.warn(
                        "Both target_delimiter and target start with a space. This may cause issues.",
                        Warning,
                        stacklevel=2,
                    )
                labeled_examples += self.target_delimiter
                labeled_examples += prefix
                if isinstance(shot_target, list):
                    labeled_examples += str(shot_target[0])
                elif self.config.doc_to_choice is None or isinstance(shot_target, str):
                    labeled_examples += shot_target
                else:
                    labeled_examples += str(self.doc_to_choice(shot)[shot_target])
            # NOTE(review): delimiter appended per-example regardless of an empty
            # target — confirm against the original file's indentation.
            labeled_examples += self.fewshot_delimiter

        return labeled_examples

    def get_chat_context(
        self,
        doc: dict,
        num_fewshot: int,
        fewshot_as_multiturn: bool = False,
        gen_prefix: Optional[str] = None,
    ):
        """Build the few-shot context as chat messages.

        With ``fewshot_as_multiturn`` each shot becomes a user/assistant turn
        pair; otherwise all shots are flattened into a single user message via
        ``get_context``.
        """
        # TODO: Do we need any other delimiter
        prefix = gen_prefix + " " if gen_prefix else ""
        chat_history = []
        # draw an extra fewshot sample if using same split as evaluating on
        n_samples = (
            num_fewshot + 1
            if self.config.fewshot_split == self.config.test_split
            else num_fewshot
        )
        # draw `n_samples` docs from fewshot_docs (done before branching so the
        # RNG state advances identically in both modes)
        fewshotex = self.sample(n_samples)

        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        if not fewshot_as_multiturn:
            # get fewshot context as one user turn
            chat_history.append(
                {
                    "role": "user",
                    "content": self.get_context(doc, num_fewshot, gen_prefix=gen_prefix),
                }
            )
            return chat_history

        for shot in selected_docs:
            shot_content = self.doc_to_text(shot)
            shot_target = self.doc_to_target(shot)

            if self.config.doc_to_choice is None or isinstance(shot_content, str):
                user_content = shot_content
            else:
                user_content = self.doc_to_choice(shot)[shot_content]
            chat_history.append({"role": "user", "content": user_content})

            if isinstance(shot_target, list):
                assistant_content = prefix + str(shot_target[0])
            elif self.config.doc_to_choice is None or isinstance(shot_target, str):
                assistant_content = prefix + shot_target
            else:
                assistant_content = prefix + str(self.doc_to_choice(shot)[shot_target])
            chat_history.append({"role": "assistant", "content": assistant_content})

        return chat_history

    def sample(self, n: int):
        """
        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
        """
        return self.rnd.sample(self.docs, n)
190
+
191
+
192
class FirstNSampler(ContextSampler):
    def sample(self, n: int) -> list:
        """
        Draw the first `n` samples in order from the specified split.
        Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
        """
        # Fixed: return annotation was `-> None` although a document slice is returned.
        assert n <= len(self.docs), (
            f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
        )
        return self.docs[:n]
202
+
203
+
204
class BalancedSampler(ContextSampler):
    def sample(self, n: int) -> None:
        """
        TODO: this should return approximately class-balanced samples from our fewshot examples.
        TODO: what order should they be in? maybe random?
        """
        # Unimplemented stub: currently returns None, so selecting this sampler
        # would break few-shot context construction downstream.
        pass
212
+
213
+
214
class ManualSampler(ContextSampler):
    def sample(self, n: int) -> None:
        """Unimplemented placeholder sampler; currently returns None."""
        pass
218
+
219
+
220
# Named few-shot sampling strategies selectable from task configs.
SAMPLER_REGISTRY = {
    "default": ContextSampler,
    "first_n": FirstNSampler,
}


def get_sampler(name: str):
    """Look up a registered few-shot sampler class by name."""
    if name in SAMPLER_REGISTRY:
        return SAMPLER_REGISTRY[name]
    raise ValueError(
        f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}"
    )
Prism/LLaDA/LLaDA_Baseline/dllm_eval/api/task.py ADDED
@@ -0,0 +1,1881 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import ast
3
+ import logging
4
+ import random
5
+ import re
6
+ from collections.abc import Callable
7
+ from copy import deepcopy
8
+ from dataclasses import asdict, dataclass
9
+ from inspect import getsource
10
+ from typing import (
11
+ Any,
12
+ Dict,
13
+ Iterable,
14
+ Iterator,
15
+ List,
16
+ Literal,
17
+ Mapping,
18
+ Optional,
19
+ Tuple,
20
+ Union,
21
+ )
22
+
23
+ import datasets
24
+ import numpy as np
25
+ from tqdm import tqdm
26
+
27
+ from dllm_eval import utils
28
+ from dllm_eval.api import samplers
29
+ from dllm_eval.api.instance import Instance, OutputType
30
+ from dllm_eval.api.metrics import bits_per_byte, mean, weighted_perplexity
31
+ from dllm_eval.api.registry import (
32
+ AGGREGATION_REGISTRY,
33
+ DEFAULT_METRIC_REGISTRY,
34
+ get_aggregation,
35
+ get_metric,
36
+ get_metric_aggregation,
37
+ is_higher_better,
38
+ )
39
+ from dllm_eval.caching.cache import load_from_cache, save_to_cache
40
+ from dllm_eval.filters import build_filter_ensemble
41
+ from dllm_eval.prompts import get_prompt
42
+
43
+
44
# The set of request/output types a task config may declare; validated against
# `TaskConfig.output_type` when a ConfigurableTask is built.
ALL_OUTPUT_TYPES = [
    "loglikelihood",
    "multiple_choice",
    "loglikelihood_rolling",
    "generate_until",
]

# Module-level logger used throughout this file for task-config warnings/info.
eval_logger = logging.getLogger(__name__)
52
+
53
+
54
@dataclass
class TaskConfig(dict):
    """Declarative configuration for one evaluation task: dataset source and
    splits, prompt formatting, fewshot behavior, generation settings, and
    metric/scoring options.

    Subclasses ``dict`` so it can be passed where a mapping is expected, but
    item access is routed through attributes via ``__getitem__``/``__setitem__``
    below — the underlying dict storage itself is not populated by field
    assignment (NOTE(review): so ``dict.get`` on an instance will not see
    dataclass fields; verify callers that use ``.get``).
    """

    # task naming/registry
    task: Optional[str] = None
    task_alias: Optional[str] = None
    tag: Optional[Union[str, list]] = None
    # HF dataset options.
    # which dataset to use,
    # and what splits for what purpose
    custom_dataset: Optional[Callable] = None
    dataset_path: Optional[str] = None
    dataset_name: Optional[str] = None
    dataset_kwargs: Optional[dict] = None
    training_split: Optional[str] = None
    validation_split: Optional[str] = None
    test_split: Optional[str] = None
    fewshot_split: Optional[str] = (
        None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaluating (?)
    )
    # formatting / prompting options.
    # see docs/advanced_task_guide.md for more info
    process_docs: Optional[Callable] = None
    doc_to_text: Optional[Union[Callable, str]] = None
    doc_to_target: Optional[Union[Callable, str]] = None
    doc_to_image: Union[Callable, str] = None
    doc_to_audio: Union[Callable, str] = None
    unsafe_code: bool = False
    doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
    process_results: Optional[Union[Callable, str]] = None
    use_prompt: Optional[str] = None
    description: str = ""
    target_delimiter: str = " "
    fewshot_delimiter: str = "\n\n"
    fewshot_config: Optional[dict] = None
    # runtime configuration options
    num_fewshot: Optional[int] = None
    # scoring options
    metric_list: Optional[list] = None
    output_type: OutputType = "generate_until"
    generation_kwargs: Optional[dict] = None
    repeats: int = 1
    filter_list: Optional[Union[str, list]] = None
    should_decontaminate: bool = False
    doc_to_decontamination_query: Optional[str] = None
    gen_prefix: Optional[str] = None
    metadata: Optional[dict] = (
        None  # by default, not used in the code. allows for users to pass arbitrary info to tasks
    )

    def __post_init__(self) -> None:
        # Normalize/validate generation settings after dataclass construction.
        if self.generation_kwargs is not None:
            if self.output_type != "generate_until":
                # generation_kwargs only apply to generative tasks; warn but keep them.
                eval_logger.warning(
                    f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
                )

            # YAML may parse temperature as a string/int; coerce to float.
            if "temperature" in self.generation_kwargs:
                self.generation_kwargs["temperature"] = float(
                    self.generation_kwargs["temperature"]
                )

            # Without explicit stop sequences, stop at the fewshot delimiter.
            if "until" not in self.generation_kwargs:
                eval_logger.warning(
                    f"{self.task}: No `until` specified in `generation_kwargs`! Defaulting to the fewshot_delimiter={repr(self.fewshot_delimiter)}"
                )
                self.generation_kwargs["until"] = [self.fewshot_delimiter]
        else:
            if self.output_type == "generate_until":
                # ensure that we greedily generate in absence of explicit arguments otherwise
                self.generation_kwargs = {
                    "until": (
                        None
                        if self.fewshot_delimiter is None
                        else [self.fewshot_delimiter]
                    ),
                    "do_sample": False,
                    "temperature": 0,
                }
                eval_logger.warning(
                    f"{self.task}: No `generation_kwargs` specified in task config, defaulting to {self.generation_kwargs}"
                )

    def __getitem__(self, item):
        # Mapping-style access reads dataclass attributes, not the dict storage.
        return getattr(self, item)

    def __setitem__(self, item, value):
        # Mapping-style assignment writes dataclass attributes, not the dict storage.
        return setattr(self, item, value)

    def to_dict(self, keep_callable: bool = False) -> dict:
        """dumps the current config as a dictionary object, as a printable format.
        null fields will not be printed.
        Used for dumping results alongside full task configuration

        :param keep_callable: if True, callables are kept as-is instead of
            being serialized to their source text.
        :return: dict
            A printable dictionary version of the TaskConfig object.

        # TODO: should any default value in the TaskConfig not be printed?
        """
        cfg_dict = asdict(self)
        # remove values that are `None`
        for k, v in list(cfg_dict.items()):
            if v is None:
                cfg_dict.pop(k)
            elif k == "metric_list":
                # metric_list entries may carry callables (custom metrics/aggregations);
                # serialize each so the dump stays printable.
                for metric_dict in v:
                    for metric_key, metric_value in metric_dict.items():
                        if callable(metric_value):
                            metric_dict[metric_key] = self.serialize_function(
                                metric_value, keep_callable=keep_callable
                            )
                cfg_dict[k] = v
            elif callable(v):
                cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable)
        return cfg_dict

    def serialize_function(
        self, value: Union[Callable, str], keep_callable=False
    ) -> Union[Callable, str]:
        """Serializes a given function or string.

        If 'keep_callable' is True, the original callable is returned.
        Otherwise, attempts to return the source code of the callable using 'getsource'.
        Falls back to `str(value)` when source is unavailable (builtins, lambdas
        defined interactively, etc.).
        """
        if keep_callable:
            return value
        else:
            try:
                return getsource(value)
            except (TypeError, OSError):
                return str(value)
184
+
185
+
186
class Task(abc.ABC):
    """A task represents an entire benchmark including its dataset, problems,
    answers, and evaluation methods. See BoolQ for a simple example implementation

    A `doc` can be any python object which represents one instance of evaluation.
    This is usually a dictionary e.g.
        {"question": ..., "answer": ...} or
        {"question": ..., question, answer)
    """

    # Task version, bumped when a task's prompting/scoring changes.
    VERSION: Optional[Union[int, str]] = None

    # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
    # or a path to a custom `datasets` loading script.
    DATASET_PATH: Optional[str] = None

    # The name of a subset within `DATASET_PATH`.
    DATASET_NAME: Optional[str] = None

    # One of ALL_OUTPUT_TYPES; determines the request type built per doc.
    OUTPUT_TYPE: Optional[OutputType] = None

    def __init__(
        self,
        data_dir: Optional[str] = None,
        cache_dir: Optional[str] = None,
        download_mode: Optional[datasets.DownloadMode] = None,
        config: Optional[Mapping] = None,  # Union[dict, TaskConfig]
    ) -> None:
        """
        :param data_dir: str
            Stores the path to a local folder containing the `Task`'s data files.
            Use this to specify the path to manually downloaded data (usually when
            the dataset is not publicly accessible).
        :param cache_dir: str
            The directory to read/write the `Task` dataset. This follows the
            HuggingFace `datasets` API with the default cache directory located at:
                `~/.cache/huggingface/datasets`
            NOTE: You can change the cache location globally for a given process
            to another directory:
                `export HF_DATASETS_CACHE="/path/to/another/directory"`
        :param download_mode: datasets.DownloadMode
            How to treat pre-existing `Task` downloads and data.
            - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
                Reuse download and reuse dataset.
            - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
                Reuse download with fresh dataset.
            - `datasets.DownloadMode.FORCE_REDOWNLOAD`
                Fresh download and fresh dataset.
        """
        # Eagerly fetch the dataset; subclasses may override `download`.
        self.download(data_dir, cache_dir, download_mode)
        self._training_docs: Optional[list] = None
        self._fewshot_docs: Optional[list] = None
        self._instances: Optional[List[Instance]] = None

        self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()

        # Default filter pipeline: take the first model response per request.
        self._filters = [build_filter_ensemble("none", [["take_first", None]])]
        self.fewshot_rnd: Optional[random.Random] = (
            None  # purposely induce errors in case of improper usage
        )

    def download(
        self,
        data_dir: Optional[str] = None,
        cache_dir: Optional[str] = None,
        download_mode=None,
    ) -> None:
        """Downloads and returns the task dataset.
        Override this method to download the dataset from a custom API.

        :param data_dir: str
            Stores the path to a local folder containing the `Task`'s data files.
            Use this to specify the path to manually downloaded data (usually when
            the dataset is not publicly accessible).
        :param cache_dir: str
            The directory to read/write the `Task` dataset. This follows the
            HuggingFace `datasets` API with the default cache directory located at:
                `~/.cache/huggingface/datasets`
            NOTE: You can change the cache location globally for a given process
            by setting the shell environment variable, `HF_DATASETS_CACHE`,
            to another directory:
                `export HF_DATASETS_CACHE="/path/to/another/directory"`
        :param download_mode: datasets.DownloadMode
            How to treat pre-existing `Task` downloads and data.
            - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
                Reuse download and reuse dataset.
            - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
                Reuse download with fresh dataset.
            - `datasets.DownloadMode.FORCE_REDOWNLOAD`
                Fresh download and fresh dataset.
        """
        self.dataset = datasets.load_dataset(
            path=self.DATASET_PATH,
            name=self.DATASET_NAME,
            data_dir=data_dir,
            cache_dir=cache_dir,
            download_mode=download_mode,
        )

    @property
    def config(self) -> TaskConfig:
        """Returns the TaskConfig associated with this class."""
        return self._config

    @abc.abstractmethod
    def has_training_docs(self):
        """Whether the task has a training set"""
        pass

    @abc.abstractmethod
    def has_validation_docs(self):
        """Whether the task has a validation set"""
        pass

    @abc.abstractmethod
    def has_test_docs(self):
        """Whether the task has a test set"""
        pass

    def training_docs(self) -> Iterable:
        """
        :return: Iterable[obj]
            A iterable of any object, that doc_to_text can handle
        """
        return []

    def validation_docs(self) -> Iterable:
        """
        :return: Iterable[obj]
            A iterable of any object, that doc_to_text can handle
        """
        return []

    def test_docs(self) -> Iterable:
        """
        :return: Iterable[obj]
            A iterable of any object, that doc_to_text can handle
        """
        return []

    def fewshot_docs(self) -> Iterable:
        """
        :return: Iterable[obj]
            A iterable of any object, that doc_to_text can handle

        Preference order for the fewshot pool: train, then validation, then
        (with a warning) test.
        """
        if self.has_training_docs():
            return self.training_docs()
        elif self.has_validation_docs():
            return self.validation_docs()
        else:
            # NOTE(review): `self.config.get(...)` calls dict.get on TaskConfig,
            # whose dict storage is never populated by attribute assignment —
            # this likely always yields 0; verify against TaskConfig semantics.
            if self.config.get("num_fewshot", 0) > 0:
                eval_logger.warning(
                    f"[Task: {self.config.task}] has_training_docs and has_validation_docs are False"
                    ", using test_docs as fewshot_docs but this is not recommended."
                )
            return self.test_docs()

    def _process_doc(self, doc: dict) -> dict:
        """
        Override this to process (detokenize, strip, replace, etc.) individual
        documents. This can be used in a map over documents of a data split.
        E.g. `map(self._process_doc, self.dataset["validation"])`

        :return: dict
            The processed version of the specified `doc`.
        """
        return doc

    @property
    def instances(self) -> List[Instance]:
        """After calling `task.build_all_requests()`, tasks
        maintain a list of the dataset instances which will be evaluated.
        """
        return self._instances

    def fewshot_examples(self, k, rnd):
        # Lazily materialize and cache the training docs, then sample k of them.
        if self._training_docs is None:
            self._training_docs = list(self.training_docs())

        return rnd.sample(self._training_docs, k)

    def doc_to_decontamination_query(self, doc):
        raise NotImplementedError(
            "Override doc_to_decontamination_query with document specific decontamination query."
        )

    @abc.abstractmethod
    def doc_to_text(self, doc):
        pass

    @abc.abstractmethod
    def doc_to_target(self, doc):
        pass

    # not an abstractmethod because not every language-only task has to implement this
    def doc_to_image(self, doc):
        raise NotImplementedError

    def doc_to_audio(self, doc):
        raise NotImplementedError

    def doc_to_prefix(self, doc):
        # Optional generation prefix prepended to model output; empty by default.
        return ""

    def build_all_requests(
        self,
        *,
        limit: Union[int, None] = None,
        samples: Optional[List[int]] = None,
        rank: int = 0,
        world_size: int = 1,
        cache_requests: bool = False,
        rewrite_requests_cache: bool = False,
        system_instruction: Optional[str] = None,
        apply_chat_template: bool = False,
        fewshot_as_multiturn: bool = False,
        chat_template: Optional[Callable] = None,
        tokenizer_name: str = "",
    ) -> None:
        """Build a set of Instances for a task, and store them in task.instances"""

        # used with caching
        og_limit = limit

        # Cache key encodes everything that changes the built contexts, so a
        # stale cache is never reused across differing prompt settings.
        cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"
        cache_key += "-chat_template" if apply_chat_template else ""
        cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else ""
        cache_key += (
            f"-system_prompt_hash{utils.hash_string(system_instruction)}"
            if system_instruction is not None
            else ""
        )
        cache_key += f"-tokenizer{tokenizer_name}"

        cached_instances = load_from_cache(file_name=cache_key, cache=cache_requests)

        if cache_requests and cached_instances and not rewrite_requests_cache:
            cached_instances = cached_instances[:limit]

            flattened_instances = [
                instance
                for instance_group in cached_instances
                for instance in instance_group
            ]

            self._instances = flattened_instances
            return

        eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...")

        instances = []

        # process all documents when caching is specified for simplicity
        if (
            cache_requests
            and (not cached_instances or rewrite_requests_cache)
            and limit is not None
        ):
            limit = None

        doc_id_docs = list(
            self.doc_iterator(
                rank=rank, limit=limit, samples=samples, world_size=world_size
            )
        )

        num_docs = len(doc_id_docs)

        for doc_id, doc in tqdm(
            doc_id_docs,
            total=num_docs,
        ):
            # sample fewshot context #TODO: need to offset doc_id by rank now!
            # NOTE(review): passes chat-template kwargs that the base
            # `fewshot_context` below does not accept — presumably relies on a
            # subclass override; confirm for direct Task subclasses.
            fewshot_ctx = self.fewshot_context(
                doc,
                num_fewshot=0
                if self.config.num_fewshot is None
                else self.config.num_fewshot,
                system_instruction=system_instruction,
                apply_chat_template=apply_chat_template,
                fewshot_as_multiturn=fewshot_as_multiturn,
                chat_template=chat_template,
                gen_prefix=self.doc_to_prefix(doc),
            )

            # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
            inst = self.construct_requests(
                doc=doc,
                ctx=fewshot_ctx,
                metadata=(self.config["task"], doc_id, self.config.repeats),
                apply_chat_template=apply_chat_template,
                chat_template=chat_template,
            )

            if not isinstance(inst, list):
                inst = [inst]

            instances.append(inst)

        # now flatten, this is to allow slicing to work with pickles

        sliced_instances = instances[:og_limit]

        flattened_instances = [
            instance
            for instance_group in sliced_instances
            for instance in instance_group
        ]

        self._instances = flattened_instances

        if len(self._instances) == 0:
            raise ValueError("task.build_requests() did not find any docs!")

        if cache_requests and (not cached_instances or rewrite_requests_cache):
            save_to_cache(file_name=cache_key, obj=instances)

    @abc.abstractmethod
    def construct_requests(self, doc, ctx, **kwargs):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        :param doc_idx: int
            The index of a document within `self.test_docs()` or `self.validation_docs()`,
            whichever is the main split used.
        :param repeats: int
            TODO: update this docstring
            The number of times each instance in a dataset is inferred on. Defaults to 1,
            can be increased for techniques like majority voting.
        """
        pass

    @abc.abstractmethod
    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        pass

    @abc.abstractmethod
    def aggregation(self):
        """
        :returns: {str: [metric_score] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metric scores
        """
        pass

    @abc.abstractmethod
    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        pass

    def get_config(self, key: str) -> Any:
        # Safe config attribute lookup; returns None for unknown keys.
        return getattr(self._config, key, None)

    @classmethod
    def count_bytes(cls, doc):
        """Used for byte-level perplexity metrics in rolling loglikelihood"""
        return len(doc.encode("utf-8"))

    @classmethod
    def count_words(cls, doc):
        """Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!"""
        return len(re.split(r"\s+", doc))

    @utils.positional_deprecated
    def fewshot_context(self, doc, num_fewshot, rnd=None, description=None, **kwargs):
        """Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.

        :param doc: str
            The document as returned from training_docs, validation_docs, or test_docs.
        :param num_fewshot: int
            The number of fewshot examples to provide in the returned context string.
        :param rnd: random.Random
            The pseudo-random number generator used to randomly sample examples.
            WARNING: This is currently a required arg although it's optionalized with a default `None`.
        :param description: str
            The task's description that will be prepended to the fewshot examples.
        :returns: str
            The fewshot context.
        """
        if rnd is None:
            if self.fewshot_rnd is not None:
                rnd = self.fewshot_rnd
            else:
                raise ValueError(
                    "A `random.Random` generator argument must be provided to `rnd`"
                )

        description = description if description else ""

        if num_fewshot == 0:
            labeled_examples = ""
        else:
            # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
            if self.has_training_docs():
                fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
            else:
                if self._fewshot_docs is None:
                    self._fewshot_docs = list(
                        self.validation_docs()
                        if self.has_validation_docs()
                        else self.test_docs()
                    )

                # Oversample by one so filtering out `doc` still leaves num_fewshot.
                fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)

                # get rid of the doc that's the one we're evaluating, if it's in the fewshot
                fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]

            labeled_examples = (
                "\n\n".join(
                    [
                        self.doc_to_text(doc) + self.doc_to_target(doc)
                        for doc in fewshotex
                    ]
                )
                + "\n\n"
            )

        example = self.doc_to_text(doc)
        return description + labeled_examples + example

    def apply_filters(self) -> Optional[List[Instance]]:
        """Iterates over FilterEnsembles and applies them to instances"""
        # NOTE(review): instances are only *returned* on the no-filter path;
        # when filters exist they mutate self._instances in place and this
        # method returns None.
        if hasattr(self, "_filters"):
            for f in self._filters:
                f.apply(self._instances)
        else:
            eval_logger.warning("No filter defined, passing through instances")
            return self._instances

    def dump_config(self) -> dict:
        """Returns the config as a dictionary."""
        # TODO: this should only return the overrides applied to a non-YAML task's configuration.
        # (num_fewshot)
        return self.config.to_dict()

    def set_config(self, key: str, value: Any, update: bool = False) -> None:
        """Set or update the configuration for a given key."""
        if key is None:
            raise ValueError("Key must be provided.")

        if update:
            # Merge into an existing dict-valued config entry rather than replace.
            current_value = getattr(self._config, key, {})
            if not isinstance(current_value, dict):
                raise TypeError(
                    f"Expected a dict for key '{key}', got {type(current_value).__name__} instead."
                )
            current_value.update(value)
        else:
            setattr(self._config, key, value)

    def override_metric(self, metric_name: str) -> None:
        """
        Override the default metrics used for evaluation with custom metrics.

        Parameters:
        - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
        """
        # Reset all metric bookkeeping, then register only the given metric.
        (
            self._metric_fn_list,
            self._aggregation_list,
            self._metric_fn_kwargs,
            self._higher_is_better,
        ) = ({}, {}, {}, {})
        self._metric_fn_list[metric_name] = get_metric(metric_name)
        self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
        self._higher_is_better[metric_name] = is_higher_better(metric_name)
        self._metric_fn_kwargs[metric_name] = {}
        if not isinstance(self, ConfigurableTask):
            self.process_results = lambda x, y: {metric_name: get_metric(metric_name)}
            self.aggregation = lambda: {
                metric_name: get_metric_aggregation(metric_name)
            }
        setattr(self._config, "metric_list", [{"metric": metric_name}])
        setattr(self._config, "process_results", None)

    def set_fewshot_seed(self, seed: Optional[int] = None) -> None:
        # Seed the fewshot RNG (and the sampler's, if a sampler exists).
        self.fewshot_rnd = random.Random(seed)
        if hasattr(self, "sampler"):
            self.sampler.rnd = self.fewshot_rnd

    @property
    def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
        # The split actually evaluated on: test preferred, else validation.
        if self.has_test_docs():
            return self.test_docs()
        elif self.has_validation_docs():
            return self.validation_docs()
        else:
            raise ValueError(
                f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
            )

    def doc_iterator(
        self,
        *,
        rank: int = 0,
        limit: Union[int, None] = None,
        world_size: int = 1,
        samples: Optional[List[int]] = None,
    ) -> Iterator[Tuple[int, Any]]:
        # Yields (doc_id, doc) pairs for this rank's shard; `samples` selects
        # explicit doc indices and takes precedence over `limit`.
        if samples:
            n = len(self.eval_docs)
            assert all([e < n for e in samples]), (
                f"Elements of --samples should be in the interval [0,k-1] where k is the number of total examples. In this case, k={n}."
            )
            eval_logger.info(
                f"{self.config.task}: Evaluating on {len(samples)} examples"
            )
            doc_iterator = utils.create_iterator(
                enumerate(x for i, x in enumerate(self.eval_docs) if i in samples),
                rank=int(rank),
                limit=None,  # limit does not matter here since we are selecting samples directly
                world_size=int(world_size),
            )
        else:
            limit = int(limit) if limit else None
            doc_iterator = utils.create_iterator(
                enumerate(self.eval_docs),
                rank=int(rank),
                limit=limit,
                world_size=int(world_size),
            )
        return doc_iterator
729
+
730
+
731
+ class ConfigurableTask(Task):
732
+ VERSION = "Yaml"
733
+ OUTPUT_TYPE = None
734
+ CONFIG = None
735
+
736
+ def __init__(
737
+ self,
738
+ data_dir=None,
739
+ cache_dir=None,
740
+ download_mode=None,
741
+ config: Optional[dict] = None,
742
+ ) -> None: # TODO no super() call here
743
+ # Get pre-configured attributes
744
+ self._config = self.CONFIG
745
+
746
+ # Use new configurations if there was no preconfiguration
747
+ if self.config is None:
748
+ self._config = TaskConfig(**config)
749
+ # Overwrite configs
750
+ else:
751
+ if config is not None:
752
+ self._config.__dict__.update(config)
753
+
754
+ if self.config is None:
755
+ raise ValueError(
756
+ "Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg"
757
+ )
758
+
759
+ if isinstance(self.config.metadata, dict):
760
+ if "version" in self.config.metadata:
761
+ self.VERSION = self.config.metadata["version"]
762
+
763
+ if self.config.output_type is not None:
764
+ if self.config.output_type not in ALL_OUTPUT_TYPES:
765
+ raise ValueError(
766
+ f"Got invalid output_type '{self.config.output_type}', must be in '{','.join(ALL_OUTPUT_TYPES)}'"
767
+ )
768
+ self.OUTPUT_TYPE = self.config.output_type
769
+
770
+ if self.config.doc_to_image is not None:
771
+ # mark the task as requiring multimodality.
772
+ self.MULTIMODAL = True
773
+
774
+ if self.config.doc_to_audio:
775
+ # mark the task as requiring multimodality.
776
+ self.MULTIMODAL = True
777
+
778
+ if self.config.unsafe_code is not False:
779
+ self.UNSAFE_CODE = True
780
+
781
+ if self.config.dataset_path is not None:
782
+ self.DATASET_PATH = self.config.dataset_path
783
+
784
+ if self.config.dataset_name is not None:
785
+ self.DATASET_NAME = self.config.dataset_name
786
+
787
+ self._metric_fn_list = {}
788
+ self._metric_fn_kwargs = {}
789
+ self._aggregation_list = {}
790
+ self._higher_is_better = {}
791
+
792
+ if self.config.metric_list is None:
793
+ # TODO: handle this in TaskConfig.__post_init__ ?
794
+ _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
795
+
796
+ for metric_name in _metric_list:
797
+ self._metric_fn_list[metric_name] = get_metric(metric_name)
798
+ self._metric_fn_kwargs[metric_name] = {}
799
+ self._aggregation_list[metric_name] = get_metric_aggregation(
800
+ metric_name
801
+ )
802
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
803
+ else:
804
+ for metric_config in self.config.metric_list:
805
+ if "metric" not in metric_config:
806
+ raise ValueError(
807
+ "'metric' key not provided for an entry in 'metric_list', must be specified!"
808
+ )
809
+ metric_name = metric_config["metric"]
810
+ kwargs = {
811
+ key: metric_config[key]
812
+ for key in metric_config
813
+ if key
814
+ not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"]
815
+ }
816
+ hf_evaluate_metric = (
817
+ "hf_evaluate" in metric_config
818
+ and metric_config["hf_evaluate"] is True
819
+ )
820
+
821
+ if self.config.process_results is not None:
822
+ self._metric_fn_list[metric_name] = None
823
+ self._metric_fn_kwargs[metric_name] = {}
824
+ elif callable(metric_name):
825
+ metric_fn = metric_name.__call__
826
+ metric_name = metric_name.__name__
827
+ self._metric_fn_list[metric_name] = metric_fn
828
+ self._metric_fn_kwargs[metric_name] = kwargs
829
+ else:
830
+ self._metric_fn_list[metric_name] = get_metric(
831
+ metric_name, hf_evaluate_metric
832
+ )
833
+ self._metric_fn_kwargs[metric_name] = kwargs
834
+
835
+ if "aggregation" in metric_config:
836
+ agg_name = metric_config["aggregation"]
837
+ if isinstance(agg_name, str):
838
+ self._aggregation_list[metric_name] = get_aggregation(agg_name)
839
+ elif callable(agg_name): # noqa: E721
840
+ self._aggregation_list[metric_name] = metric_config[
841
+ "aggregation"
842
+ ]
843
+ else:
844
+ INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
845
+ metric_agg = get_metric_aggregation(metric_name)
846
+ eval_logger.warning(
847
+ f"[Task: {self.config.task}] metric {metric_name} is defined, but aggregation is not. "
848
+ f"using default "
849
+ f"aggregation={INV_AGG_REGISTRY[metric_agg]}"
850
+ )
851
+ self._aggregation_list[metric_name] = metric_agg
852
+
853
+ if "higher_is_better" in metric_config:
854
+ self._higher_is_better[metric_name] = metric_config[
855
+ "higher_is_better"
856
+ ]
857
+ else:
858
+ eval_logger.warning(
859
+ f"[Task: {self.config.task}] metric {metric_name} is defined, but higher_is_better is not. "
860
+ f"using default "
861
+ f"higher_is_better={is_higher_better(metric_name)}"
862
+ )
863
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
864
+
865
+ self.download(self.config.dataset_kwargs)
866
+ self._training_docs = None
867
+ self._fewshot_docs = None
868
+
869
+ if self.config.filter_list is not None:
870
+ self._filters = []
871
+ for filter_config in self.config.filter_list:
872
+ filter_name = filter_config["name"]
873
+ filter_functions = filter_config["filter"]
874
+ components = []
875
+ for function in filter_functions:
876
+ kwargs = {
877
+ key: function[key] for key in function if key != "function"
878
+ }
879
+ components.append([function["function"], kwargs])
880
+ filter_pipeline = build_filter_ensemble(filter_name, components)
881
+ self._filters.append(filter_pipeline)
882
+ else:
883
+ # TODO: handle repeats in a more general way rather than just discarding
884
+ eval_logger.debug(
885
+ "No custom filters defined. Using default 'take_first' filter for handling repeats."
886
+ )
887
+ self._filters = [build_filter_ensemble("none", [["take_first", None]])]
888
+
889
+ if self.config.use_prompt is not None:
890
+ eval_logger.info(f"loading prompt {self.config.use_prompt}")
891
+ self.prompt = get_prompt(
892
+ self.config.use_prompt, self.DATASET_PATH, self.DATASET_NAME
893
+ )
894
+ else:
895
+ self.prompt = None
896
+
897
+ if self.fewshot_docs() is not None:
898
+ self.fewshot_rnd = (
899
+ random.Random()
900
+ ) # setting with no seed, to be overridden at a later time
901
+ config_sampler: Union[str, Callable] = (
902
+ self.config.fewshot_config.get("sampler", "default")
903
+ if self.config.fewshot_config
904
+ else "default"
905
+ )
906
+ if isinstance(config_sampler, str):
907
+ self.sampler = samplers.get_sampler(config_sampler)(
908
+ list(self.fewshot_docs()), self, rnd=self.fewshot_rnd
909
+ )
910
+ elif callable(config_sampler) and issubclass(
911
+ config_sampler, samplers.ContextSampler
912
+ ):
913
+ self.sampler = config_sampler(
914
+ docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd
915
+ )
916
+ else:
917
+ raise TypeError(
918
+ f"fewshot_config.sampler should be a string or callable of ContextSampler type, "
919
+ f"not {type(config_sampler)}"
920
+ )
921
+
922
+ self.task_docs = self.eval_docs
923
+
924
+ # Test One Doc
925
+ self.features = list(self.task_docs.features.keys())
926
+ self.multiple_input = 0
927
+ self.multiple_target = 0
928
+ test_doc = self.task_docs[0]
929
+ test_text = self.doc_to_text(test_doc)
930
+ test_target = self.doc_to_target(test_doc)
931
+
932
+ if self.config.doc_to_choice is not None:
933
+ test_choice = self.doc_to_choice(test_doc)
934
+ if not isinstance(test_choice, list):
935
+ eval_logger.error("doc_to_choice must return list")
936
+ else:
937
+ num_choice = len(test_choice)
938
+
939
+ if isinstance(test_text, int):
940
+ eval_logger.debug(
941
+ "doc_to_text returned an int. Assuming multiple inputs."
942
+ )
943
+ self.multiple_input = num_choice
944
+ else:
945
+ test_choice = None
946
+
947
+ if isinstance(test_target, list):
948
+ eval_logger.debug(
949
+ "doc_to_target returned a list. Assuming multiple targets."
950
+ )
951
+ self.multiple_target = len(test_target)
952
+ else:
953
+ if (isinstance(test_target, int)) and (test_choice is not None):
954
+ test_target = test_choice[test_target]
955
+ else:
956
+ test_target = str(test_target)
957
+
958
+ if test_choice is not None:
959
+ check_choices = test_choice
960
+ else:
961
+ check_choices = [test_target]
962
+ if self.config.doc_to_choice is not None:
963
+ for choice in check_choices:
964
+ choice_has_whitespace = True if choice[0].isspace() else False
965
+ delimiter_has_whitespace = (
966
+ True
967
+ if self.config.target_delimiter.rstrip()
968
+ != self.config.target_delimiter
969
+ else False
970
+ )
971
+
972
+ if delimiter_has_whitespace and choice_has_whitespace:
973
+ eval_logger.debug(
974
+ f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" have whitespace'
975
+ )
976
+ elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
977
+ eval_logger.debug(
978
+ f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
979
+ )
980
+
981
    def download(
        self, dataset_kwargs: Optional[Dict[str, Any]] = None, **kwargs
    ) -> None:
        """Load this task's dataset into ``self.dataset``.

        If the config supplies a ``custom_dataset`` callable it is invoked with
        the task's metadata and dataset kwargs merged together; otherwise the
        dataset is fetched via ``datasets.load_dataset`` using the class-level
        ``DATASET_PATH`` / ``DATASET_NAME``.

        :param dataset_kwargs: extra keyword arguments forwarded to
            ``datasets.load_dataset`` (ignored on the custom-dataset path).
        """
        if isinstance(self.config.custom_dataset, Callable):
            # Custom loaders take their configuration from task metadata, not
            # from the `dataset_kwargs` parameter — warn users where to put it.
            eval_logger.warning(
                f"{self.config.task}: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."
                + "\nFor example --metadata='{\"max_seq_lengths\":[4096, 8192]}'. For details see task Readme."
            )
            self.dataset = self.config.custom_dataset(
                **(self.config.metadata or {}), **(self.config.dataset_kwargs or {})
            )
        else:
            self.dataset = datasets.load_dataset(
                path=self.DATASET_PATH,
                name=self.DATASET_NAME,
                **dataset_kwargs if dataset_kwargs is not None else {},
            )
998
+
999
+ def has_training_docs(self) -> bool:
1000
+ if self.config.training_split is not None:
1001
+ return True
1002
+ else:
1003
+ return False
1004
+
1005
+ def has_validation_docs(self) -> bool:
1006
+ if self.config.validation_split is not None:
1007
+ return True
1008
+ else:
1009
+ return False
1010
+
1011
+ def has_test_docs(self) -> bool:
1012
+ if self.config.test_split is not None:
1013
+ return True
1014
+ else:
1015
+ return False
1016
+
1017
+ def training_docs(self) -> datasets.Dataset:
1018
+ if self.has_training_docs():
1019
+ if self.config.process_docs is not None:
1020
+ return self.config.process_docs(
1021
+ self.dataset[self.config.training_split]
1022
+ )
1023
+ return self.dataset[self.config.training_split]
1024
+
1025
+ def validation_docs(self) -> datasets.Dataset:
1026
+ if self.has_validation_docs():
1027
+ if self.config.process_docs is not None:
1028
+ return self.config.process_docs(
1029
+ self.dataset[self.config.validation_split]
1030
+ )
1031
+ return self.dataset[self.config.validation_split]
1032
+
1033
+ def test_docs(self) -> datasets.Dataset:
1034
+ if self.has_test_docs():
1035
+ if self.config.process_docs is not None:
1036
+ return self.config.process_docs(self.dataset[self.config.test_split])
1037
+ return self.dataset[self.config.test_split]
1038
+
1039
    def fewshot_docs(self):
        """Resolve the document pool used to sample few-shot examples.

        Resolution order:
        1. ``config.fewshot_split`` — a dataset split (optionally post-processed
           by ``config.process_docs``);
        2. ``config.fewshot_config["samples"]`` — an inline list of sample
           dicts, or a zero-arg callable producing such a list;
        3. otherwise, fall back to the parent class's split-preference rule
           (warning if few-shot examples were requested without a source).
        """
        if self.config.fewshot_split is not None:
            if self.config.process_docs is not None:
                return self.config.process_docs(self.dataset[self.config.fewshot_split])
            return self.dataset[self.config.fewshot_split]
        elif (
            self.config.fewshot_config is not None
            and self.config.fewshot_config.get("samples", None) is not None
        ):
            if isinstance(self.config.fewshot_config["samples"], list):
                # Inline list of example dicts.
                return self.config.fewshot_config["samples"]
            elif callable(self.config.fewshot_config["samples"]):
                # Factory function producing the example list.
                return self.config.fewshot_config["samples"]()
            else:
                raise Exception(
                    "`fewshot_config['samples']` was incorrectly defined in the configuration. It should be either a list of samples as a dict, or function returning this list."
                )
        else:
            if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
                eval_logger.warning(
                    f"[Task: {self.config.task}] "
                    "num_fewshot > 0 but fewshot_split is None. "
                    "using preconfigured rule."
                )
            return super().fewshot_docs()
1064
+
1065
+ @staticmethod
1066
+ def append_target_question(
1067
+ labeled_examples: List[Dict[str, str]],
1068
+ question: str,
1069
+ fewshot_as_multiturn: bool = False,
1070
+ gen_prefix: Optional[str] = None,
1071
+ ) -> None:
1072
+ """Adds a target question to the labeled examples list.
1073
+ If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
1074
+ Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant.
1075
+ """
1076
+ if not fewshot_as_multiturn:
1077
+ # if no messages or last message is system, append as new user entry
1078
+ if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
1079
+ labeled_examples.append({"role": "user", "content": question})
1080
+ # if last message is user, append to it to avoid two user messages in a row
1081
+ else:
1082
+ labeled_examples[-1]["content"] += question
1083
+ else:
1084
+ # if fewshot_as_multiturn is True, append as next user entry (last is always assistant)
1085
+ labeled_examples.append({"role": "user", "content": question})
1086
+ if gen_prefix:
1087
+ labeled_examples.append({"role": "assistant", "content": gen_prefix})
1088
+
1089
    @utils.positional_deprecated
    def fewshot_context(
        self,
        doc: dict,
        num_fewshot: int,
        system_instruction: Optional[str] = None,
        apply_chat_template: bool = False,
        fewshot_as_multiturn: bool = False,
        chat_template: Optional[Callable] = None,
        gen_prefix: Optional[str] = None,
    ) -> Union[str, List[str]]:
        """Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.

        :param doc: str
            The document as returned from training_docs, validation_docs, or test_docs.
        :param num_fewshot: int
            The number of fewshot examples to provide in the returned context string.
        :param system_instruction: str
            System instruction to be applied to the prompt.
        :param apply_chat_template: bool
            Whether to apply the chat template to the fewshot context.
        :param fewshot_as_multiturn: bool
            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
        :param chat_template:
            callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
        :param gen_prefix:
            String to append after the <|assistant|> token.
        :returns: str
            The fewshot context.
        """
        # Chat-template mode accumulates a message list; plain mode a string.
        if apply_chat_template:
            labeled_examples = []
        else:
            labeled_examples = ""

        # get task description
        if description := self.config.description:
            description = utils.apply_template(self.config.description, doc)

        # create system prompt based on the provided system instruction and description
        if system_instruction is not None and description:
            system_prompt = (
                f"{system_instruction}{self.sampler.fewshot_delimiter}{description}"
            )
        elif system_instruction is not None:
            system_prompt = system_instruction
        elif description:
            system_prompt = description
        else:
            system_prompt = ""

        # add system prompt if specified
        if system_prompt:
            if apply_chat_template:
                labeled_examples.append({"role": "system", "content": system_prompt})
            else:
                labeled_examples = system_prompt
        # if few-shot - append examples after the system prompt
        if num_fewshot > 0:
            if apply_chat_template:
                labeled_examples.extend(
                    self.sampler.get_chat_context(
                        doc,
                        num_fewshot,
                        fewshot_as_multiturn,
                        gen_prefix=gen_prefix,
                    )
                )
            else:
                labeled_examples += self.sampler.get_context(
                    doc, num_fewshot, gen_prefix=gen_prefix
                )

        example = self.doc_to_text(doc)
        if apply_chat_template:
            if self.multiple_input:
                # Choices live in the context for multiple-input tasks.
                # TODO: append prefill?
                if not labeled_examples:
                    return ""
                return chat_template(labeled_examples)
            if isinstance(example, str):
                self.append_target_question(
                    labeled_examples,
                    example,
                    fewshot_as_multiturn,
                    gen_prefix=gen_prefix,
                )
            # for loglikelihood create a list of questions with appended choices
            elif isinstance(example, list):
                labeled_examples_list = []
                # copy chat history for each example and append the answer
                for ex in example:
                    chat = deepcopy(labeled_examples)
                    self.append_target_question(
                        chat,
                        ex,
                        fewshot_as_multiturn,
                        gen_prefix=gen_prefix,
                    )
                    # TODO: append prefill?
                    labeled_examples_list.append(
                        chat_template(
                            chat,
                            add_generation_prompt=False if gen_prefix else True,
                        )
                    )
                return labeled_examples_list
            # if example is an integer, append the choice or convert to string
            elif isinstance(example, int):
                if self.config.doc_to_choice is not None:
                    choices = self.doc_to_choice(doc)
                    self.append_target_question(
                        labeled_examples,
                        choices[example],
                        fewshot_as_multiturn,
                        gen_prefix=gen_prefix,
                    )
                else:
                    self.append_target_question(
                        labeled_examples,
                        str(example),
                        fewshot_as_multiturn,
                        gen_prefix=gen_prefix,
                    )
            # return lm.apply_chat_template(labeled_examples)
            # NOTE(review): with a gen_prefix the transcript already ends in an
            # assistant turn, so the generation prompt is suppressed.
            return chat_template(
                labeled_examples,
                add_generation_prompt=False if gen_prefix else True,
            )
        else:
            # Plain-string mode: gen_prefix is appended after the target delimiter.
            prefix = (
                self.config.target_delimiter + gen_prefix
                if gen_prefix is not None
                else ""
            )
            if self.multiple_input:
                return labeled_examples
            if isinstance(example, str):
                return labeled_examples + example + prefix
            elif isinstance(example, list):
                return [labeled_examples + ex + prefix for ex in example]
            elif isinstance(example, int):
                if self.config.doc_to_choice is not None:
                    choices = self.doc_to_choice(doc)
                    return labeled_examples + choices[example] + prefix
                else:
                    return labeled_examples + str(example) + prefix
1237
+
1238
+ def apply_filters(self) -> Optional[List[Instance]]:
1239
+ """Iterates over FilterEnsembles and applies them to instances"""
1240
+ if hasattr(self, "_filters"):
1241
+ for f in self._filters:
1242
+ f.apply(self._instances)
1243
+ else:
1244
+ eval_logger.warning("No filter defined, passing through instances")
1245
+ return self._instances
1246
+
1247
+ def should_decontaminate(self):
1248
+ return self.config.should_decontaminate
1249
+
1250
    def doc_to_decontamination_query(self, doc: dict):
        """Build the text used to check this document against a contamination index.

        Falls back to ``doc_to_text`` when no explicit query is configured.
        A configured query may be a feature/column name, a callable, or a
        template string (whose rendered output is parsed as a Python literal).
        Returns None implicitly when decontamination is disabled.
        """
        if self.config.should_decontaminate:
            if self.config.doc_to_decontamination_query is None:
                return self.doc_to_text(doc)
            else:
                doc_to_decontamination_query = self.config.doc_to_decontamination_query
                if doc_to_decontamination_query in self.features:
                    # Column name: use the raw feature value.
                    return doc[doc_to_decontamination_query]
                elif callable(doc_to_decontamination_query):
                    return doc_to_decontamination_query(doc)
                else:
                    # Template string: render, then parse the result as a literal.
                    return ast.literal_eval(
                        utils.apply_template(
                            self.config.doc_to_decontamination_query, doc
                        )
                    )
1266
+
1267
+ def _process_doc(self, doc: dict) -> dict:
1268
+ """
1269
+ Override this to process (detokenize, strip, replace, etc.) individual
1270
+ documents. This can be used in a map over documents of a data split.
1271
+ E.g. `map(self._process_doc, self.dataset["validation"])`
1272
+
1273
+ :return: dict
1274
+ The processed version of the specified `doc`.
1275
+ """
1276
+ return doc
1277
+
1278
+ def doc_to_text(self, doc, doc_to_text=None):
1279
+ if self.prompt is not None:
1280
+ doc_to_text = self.prompt
1281
+ elif doc_to_text is not None:
1282
+ doc_to_text = doc_to_text
1283
+ else:
1284
+ doc_to_text = self.config.doc_to_text
1285
+
1286
+ if isinstance(doc_to_text, int):
1287
+ return doc_to_text
1288
+ elif isinstance(doc_to_text, str):
1289
+ if doc_to_text in self.features:
1290
+ # if self.config.doc_to_choice is not None:
1291
+ # return self.doc_to_choice(doc)[doc[doc_to_text]]
1292
+ # else:
1293
+ return doc[doc_to_text]
1294
+ else:
1295
+ text_string = utils.apply_template(doc_to_text, doc)
1296
+ if text_string.isdigit() and self._config.doc_to_choice is not None:
1297
+ return ast.literal_eval(text_string)
1298
+ else:
1299
+ return text_string
1300
+ elif callable(doc_to_text):
1301
+ return doc_to_text(doc)
1302
+ # Used when applying a Promptsource template
1303
+ elif hasattr(doc_to_text, "apply"):
1304
+ applied_prompt = doc_to_text.apply(doc)
1305
+ if len(applied_prompt) == 2:
1306
+ return applied_prompt[0]
1307
+ else:
1308
+ eval_logger.warning("Applied prompt returns empty string")
1309
+ return self.config.fewshot_delimiter
1310
+ else:
1311
+ print(type(doc_to_text))
1312
+ raise TypeError
1313
+
1314
    def doc_to_target(self, doc: Mapping, doc_to_target=None) -> Union[int, str, list]:
        """Resolve the gold target(s) for a document.

        Template resolution order: loaded prompt, explicit argument, then
        ``config.doc_to_target``. A string template renders via Jinja; a
        digit-only render with choices defined is parsed as a choice index, and
        a ``[...]``-bracketed render is parsed as a literal list when possible
        (falling back to the raw string on parse failure).
        """
        if self.prompt is not None:
            doc_to_target = self.prompt
        elif doc_to_target is not None:
            doc_to_target = doc_to_target
        else:
            doc_to_target = self.config.doc_to_target

        if isinstance(doc_to_target, int):
            # Fixed choice index.
            return doc_to_target
        elif isinstance(doc_to_target, str):
            if doc_to_target in self.features:
                # Column name: return the raw feature value.
                return doc[doc_to_target]
            else:
                target_string = utils.apply_template(doc_to_target, doc)
                if target_string.isdigit() and self._config.doc_to_choice is not None:
                    # Digit-only render with choices defined -> choice index.
                    return ast.literal_eval(target_string)
                elif (
                    len(target_string) >= 2
                    and (target_string[0] == "[")
                    and (target_string[-1] == "]")
                ):
                    # Looks like a literal list (multiple targets).
                    try:
                        return ast.literal_eval(target_string)
                    except (SyntaxError, ValueError):
                        return target_string
                else:
                    return target_string
        elif isinstance(doc_to_target, list):
            return doc_to_target
        elif callable(doc_to_target):
            return doc_to_target(doc)
        # Used when applying a Promptsource template
        elif hasattr(doc_to_target, "apply"):
            applied_prompt = doc_to_target.apply(doc)
            if len(applied_prompt) == 2:
                return applied_prompt[1]
            else:
                eval_logger.warning("Applied prompt returns empty string")
                return self.config.fewshot_delimiter
        else:
            raise TypeError
1359
+
1360
    def doc_to_choice(self, doc: Any, doc_to_choice=None) -> List[str]:
        """Resolve the answer-choice list for a document.

        The choice spec may be a feature/column name, a template string parsed
        as a literal list, a plain list, a dict (values used), a callable, or a
        Promptsource template. NOTE(review): if the config does not define
        ``doc_to_choice`` this logs an error and then falls through to the
        final ``raise TypeError`` — preserved as-is.
        """
        if self.prompt is not None:
            doc_to_choice = self.prompt
        elif doc_to_choice is not None:
            doc_to_choice = doc_to_choice
        elif self.config.doc_to_choice is None:
            eval_logger.error("doc_to_choice was called but not set in config")
        else:
            doc_to_choice = self.config.doc_to_choice

        if isinstance(doc_to_choice, str):
            if doc_to_choice in self.features:
                # Column name: choices stored directly in the document.
                return doc[doc_to_choice]
            else:
                # Template string whose render is a literal list.
                return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
        elif isinstance(doc_to_choice, list):
            return doc_to_choice
        elif isinstance(doc_to_choice, dict):
            return list(doc_to_choice.values())
        elif callable(doc_to_choice):
            return doc_to_choice(doc)
        elif hasattr(doc_to_choice, "get_answer_choices_list"):
            # Promptsource template.
            return doc_to_choice.get_answer_choices_list(doc)
        else:
            raise TypeError
1385
+
1386
+ def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
1387
+ if doc_to_image is not None:
1388
+ doc_to_image = doc_to_image
1389
+ elif self.config.doc_to_image is not None:
1390
+ doc_to_image = self.config.doc_to_image
1391
+ else:
1392
+ return None
1393
+
1394
+ if isinstance(doc_to_image, list):
1395
+ image_feature = [
1396
+ self.doc_to_image(doc, feature) for feature in doc_to_image
1397
+ ]
1398
+ return [feature for feature in image_feature if feature is not None]
1399
+ elif isinstance(doc_to_image, str):
1400
+ if doc_to_image in self.features:
1401
+ return doc[doc_to_image]
1402
+ else:
1403
+ return ast.literal_eval(utils.apply_template(doc_to_image, doc))
1404
+ elif callable(doc_to_image):
1405
+ return doc_to_image(doc)
1406
+ else:
1407
+ return None
1408
+
1409
+ def doc_to_audio(self, doc: Any, doc_to_audio=None) -> Union[int, str, list]:
1410
+ if doc_to_audio is not None:
1411
+ doc_to_audio = doc_to_audio
1412
+ elif self.config.doc_to_audio is not None:
1413
+ doc_to_audio = self.config.doc_to_audio
1414
+ else:
1415
+ return None
1416
+
1417
+ if isinstance(doc_to_audio, list):
1418
+ audio_feature = [
1419
+ self.doc_to_audio(doc, feature) for feature in doc_to_audio
1420
+ ]
1421
+ return [feature for feature in audio_feature if feature is not None]
1422
+ elif isinstance(doc_to_audio, str):
1423
+ if doc_to_audio in self.features:
1424
+ return doc[doc_to_audio]
1425
+ else:
1426
+ return ast.literal_eval(utils.apply_template(doc_to_audio, doc))
1427
+ elif callable(doc_to_audio):
1428
+ return doc_to_audio(doc)
1429
+ else:
1430
+ return None
1431
+
1432
+ def doc_to_prefix(self, doc):
1433
+ if (gen_prefix := self.config.gen_prefix) is not None:
1434
+ if gen_prefix in self.features:
1435
+ return doc[gen_prefix]
1436
+ else:
1437
+ return utils.apply_template(gen_prefix, doc)
1438
+ return None
1439
+
1440
    def construct_requests(
        self, doc: dict, ctx: str, **kwargs
    ) -> Union[List[Instance], Instance]:
        """Build the LM request Instance(s) for one document.

        The argument tuple shape depends on ``OUTPUT_TYPE``:
        - loglikelihood: (context, target)
        - loglikelihood_rolling: (target,)
        - multiple_choice: one (context, delimiter+choice) pair per choice,
          emitted as a list of "loglikelihood" Instances
        - generate_until: (context, generation_kwargs)
        Multimodal features, when configured, are appended as a trailing dict.
        """
        apply_chat_template = kwargs.pop("apply_chat_template", False)
        chat_template: Callable | None = kwargs.pop("chat_template", None)

        aux_arguments = None

        if self.OUTPUT_TYPE == "loglikelihood":
            arguments = (ctx, self.doc_to_target(doc))
        elif self.OUTPUT_TYPE == "loglikelihood_rolling":
            arguments = (self.doc_to_target(doc),)
        elif self.OUTPUT_TYPE == "multiple_choice":
            choices = self.doc_to_choice(doc)
            target_delimiter = self.config.target_delimiter
            if apply_chat_template:
                # Chat templates handle their own separators.
                target_delimiter = ""
            if self.multiple_input:
                # If there are multiple inputs, choices are placed in the ctx
                # apply chat_template to choices if apply_chat_template
                cont = self.doc_to_target(doc)

                arguments = [
                    (
                        ctx
                        + (
                            chat_template([{"role": "user", "content": choice}])
                            if apply_chat_template
                            else choice
                        ),
                        f"{target_delimiter}{cont}",
                    )
                    for choice in choices
                ]
            else:
                # Otherwise they are placed in the continuation
                arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]

            # TODO: we should raise a warning telling users this will at most ~2x runtime.
            if "acc_mutual_info" in self._metric_fn_list.keys():
                # if we are calculating multiple choice accuracy
                # using mutual information instead of raw loglikelihood as metric, need unconditional lls.

                # here mutual info refers to calculating
                # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
                # in other words normalizing by subtracting the unconditional logprob of each choice.
                # TODO: should these be strided? will have to modify the processing in process_results if so
                aux_arguments = [
                    ("", f"{target_delimiter}{choice}") for choice in choices
                ]

                arguments.extend(aux_arguments)

        elif self.OUTPUT_TYPE == "generate_until":
            # deepcopy so per-request mutation cannot leak into the shared config.
            arguments = (ctx, deepcopy(self.config.generation_kwargs))

        multimodal_arg = {}
        if (
            self.config.doc_to_image
        ):  # TODO: ensure that non-multimodal tasks aren't getting visual args
            multimodal_arg = {
                **multimodal_arg,
                **{"visual": self.doc_to_image(doc)},
            }

        if (
            self.config.doc_to_audio
        ):  # TODO: ensure that non-multimodal tasks aren't getting audio args
            multimodal_arg = {
                **multimodal_arg,
                **{"audio": self.doc_to_audio(doc)},
            }

        if bool(multimodal_arg):
            # Append the multimodal payload as the last tuple element.
            if isinstance(arguments, list):
                arguments = [arg + (multimodal_arg,) for arg in arguments]
            else:
                arguments = arguments + (multimodal_arg,)

        if self.OUTPUT_TYPE == "multiple_choice":
            # One loglikelihood request per choice (plus unconditional extras).
            request_list = [
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
                    arguments=arg,
                    idx=i,
                    **kwargs,
                )
                for i, arg in enumerate(arguments)
            ]

            return request_list

        return Instance(
            request_type=self.OUTPUT_TYPE,
            doc=doc,
            arguments=arguments,
            idx=0,
            **kwargs,
        )
1540
+
1541
    def process_results(self, doc, results):
        """Score one document's model outputs into per-metric results.

        Delegates to ``config.process_results`` when provided. Otherwise the
        scoring shape depends on ``OUTPUT_TYPE`` — see inline comments.
        Returns a dict mapping metric name to a per-document value (later
        aggregated by ``aggregation()``).
        """
        if callable(self.config.process_results):
            return self.config.process_results(doc, results)

        result_dict = {}
        use_metric = list(self._metric_fn_list.keys())
        if self.OUTPUT_TYPE == "loglikelihood":
            results = results[0]
            ll, is_greedy = results
            return {
                **({"perplexity": ll} if "perplexity" in use_metric else {}),
                **({"acc": int(is_greedy)} if "acc" in use_metric else {}),
            }
        elif self.OUTPUT_TYPE == "loglikelihood_rolling":
            (loglikelihood,) = results
            _words = self.count_words(self.doc_to_target(doc))
            _bytes = self.count_bytes(self.doc_to_target(doc))
            return {
                **(
                    {"word_perplexity": (loglikelihood, _words)}
                    if "word_perplexity" in use_metric
                    else {}
                ),
                **(
                    {"byte_perplexity": (loglikelihood, _bytes)}
                    if "byte_perplexity" in use_metric
                    else {}
                ),
                **(
                    {"bits_per_byte": (loglikelihood, _bytes)}
                    if "bits_per_byte" in use_metric
                    else {}
                ),
            }
        elif self.OUTPUT_TYPE == "multiple_choice":
            lls, is_greedy = zip(*results)

            # retrieve choices in List[str] form, to compute choice lengths, etc.
            choices = self.doc_to_choice(doc)
            completion_len = np.array([float(len(i)) for i in choices])

            if (
                2 * len(choices) == len(lls)
                and "acc_mutual_info" in self._metric_fn_list.keys()
            ):
                # then we are doing mutual info.
                # this stores the "dryrun" / unconditional answer loglikelihoods
                # as we extend the args list with unconditional ("", continuation) pairs
                lls_unconditional = lls[len(choices) :]
                if len(lls_unconditional) != len(choices):
                    raise ValueError
                # and this stores our "regular" conditional loglikelihoods
                lls = lls[: len(choices)]

            pred = np.argmax(lls)
            # length-normalized prediction for acc_norm.
            pred_norm = np.argmax(lls / completion_len)

            if self.multiple_input:
                gold = self.doc_to_text(doc)
            else:
                gold = self.doc_to_target(doc)

            # Map gold to choice indices; -100 marks an out-of-range label.
            gold_index_error = False
            if isinstance(gold, list):
                gold = [i if i < len(choices) else -100 for i in gold]
                if -100 in gold:
                    gold_index_error = True
            else:
                if isinstance(gold, int):
                    gold = gold if gold < len(choices) else -100
                elif isinstance(gold, str):
                    gold = choices.index(gold) if gold in choices else -100

                if gold == -100:
                    gold_index_error = True

            if gold_index_error:
                eval_logger.warning(
                    f"Label index was not in within range of available choices,"
                    f"Sample:\n\n{doc}\n\n"
                )

            if self.multiple_target:
                acc = 1.0 if pred in gold else 0.0
                acc_norm = 1.0 if pred_norm in gold else 0.0
                exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
            else:
                acc = 1.0 if pred == gold else 0.0
                acc_norm = 1.0 if pred_norm == gold else 0.0
                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
                exact_match = int(is_greedy[gold]) if gold != -100 else 0

            prob_norm = utils.softmax(lls)

            # TODO use keyword arguments to the metric?
            # gold, pred, norm stuff, the original lls,
            result_dict = {
                **({"acc": acc} if "acc" in use_metric else {}),
                **({"f1": (gold, pred)} if "f1" in use_metric else {}),
                **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
                **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
                **(
                    {"brier_score": (gold, prob_norm)}
                    if "brier_score" in use_metric
                    else {}
                ),
            }

            if "acc_mutual_info" in use_metric:
                lls_mutual_info = [
                    ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
                ]
                acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
                result_dict["acc_mutual_info"] = acc_mutual_info

        elif self.OUTPUT_TYPE == "generate_until":
            gold = self.doc_to_target(doc)
            result = results[0]
            if self.config.doc_to_choice is not None:
                # If you set doc_to_choice,
                # it assumes that doc_to_target returns a number.
                choices = self.doc_to_choice(doc)
                gold = choices[gold]
            # we expect multiple_targets to be a list.
            elif self.multiple_target:
                gold = list(gold)
            # TODO: handle this better
            elif type(gold) is not type(result) and not (
                "bypass" in self._metric_fn_list.keys() or isinstance(result, list)
            ):
                # cast gold to the same type as result
                gold = type(result)(gold)

            for metric in self._metric_fn_list.keys():
                if self.multiple_target:
                    # in the case where we have multiple targets,
                    # return true if any are true
                    # TODO: this may break for multipLe_target, non zero-or-1 metrics
                    scores = []
                    if not isinstance(gold, list):
                        # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
                        gold = [gold]
                    if metric == "exact_match":
                        # exact_match scores all targets in one batched call.
                        result = [result for _ in range(len(gold))]
                        scores = self._metric_fn_list[metric](
                            references=gold,
                            predictions=result,
                            **self._metric_fn_kwargs[metric],
                        )[metric]
                        result_score = 1.0 if scores > 0.0 else 0.0
                    else:
                        for gold_option in gold:
                            try:
                                result_score = self._metric_fn_list[metric](
                                    references=[gold_option],
                                    predictions=[result],
                                    **self._metric_fn_kwargs[metric],
                                )
                            except (
                                TypeError
                            ):  # TODO: this is hacky and I don't want to do it
                                result_score = self._metric_fn_list[metric](
                                    [gold_option, result]
                                )
                            if isinstance(result_score, dict):
                                # TODO: this handles the case where HF evaluate returns a dict.
                                result_score = result_score[metric]
                            scores.append(result_score)
                        if any(scores):
                            result_score = 1.0
                        else:
                            result_score = 0.0
                else:
                    try:
                        result_score = self._metric_fn_list[metric](
                            references=[gold],
                            predictions=[result],
                            **self._metric_fn_kwargs[metric],
                        )
                    except TypeError:  # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
                        result_score = self._metric_fn_list[metric]([gold, result])
                if isinstance(result_score, dict):
                    # TODO: this handles the case where HF evaluate returns a dict.
                    # This allows for multiple metrics to be returned from the same function
                    for k, v in result_score.items():
                        result_dict[k] = v
                else:
                    result_dict[metric] = result_score
        else:
            raise ValueError(
                f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
                "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'",
            )

        return result_dict
1738
+
1739
def aggregation(self) -> dict:
    """Return the mapping from metric name to its aggregation function (built at init)."""
    return self._aggregation_list
1741
+
1742
def higher_is_better(self) -> dict:
    """Return the mapping from metric name to whether larger values are better."""
    return self._higher_is_better
1744
+
1745
def get_config(self, key: str) -> Any:
    """Return attribute `key` from the task config, or None if it is not set."""
    return getattr(self._config, key, None)
1747
+
1748
@property
def task_name(self) -> Any:
    """Registered name of this task from the config (None if unset)."""
    return getattr(self.config, "task", None)
1751
+
1752
def __repr__(self):
    """Compact summary: task name, output type, fewshot count, and doc count."""
    parts = [
        f"task_name={getattr(self.config, 'task', None)}",
        f"output_type={self.OUTPUT_TYPE}",
        f"num_fewshot={getattr(self.config, 'num_fewshot', None)}",
        f"num_samples={len(self.eval_docs)}",
    ]
    return "ConfigurableTask(" + ",".join(parts) + ")"
1759
+
1760
+
1761
class MultipleChoiceTask(Task):
    """Task whose answer is picked among fixed options by loglikelihood."""

    OUTPUT_TYPE = "loglikelihood"

    def doc_to_target(self, doc: dict) -> str:
        """Return the gold choice text, space-prefixed for tokenization."""
        gold_idx = doc["gold"]
        return " " + doc["choices"][gold_idx]

    def construct_requests(self, doc: dict, ctx: str, **kwargs) -> List[Instance]:
        """Build one loglikelihood request per candidate answer."""
        # TODO: add mutual info here?
        requests = []
        for option_idx, option in enumerate(doc["choices"]):
            requests.append(
                Instance(
                    request_type="loglikelihood",
                    doc=doc,
                    arguments=(ctx, " {}".format(option)),
                    idx=option_idx,
                    **kwargs,
                )
            )
        return requests

    def process_results(self, doc: dict, results: Iterable[Tuple[float, bool]]) -> dict:
        """Score one doc: raw and length-normalized argmax accuracy."""
        # Keep only the loglikelihood of each (ll, is_greedy) pair.
        # TODO: do we need is_greedy anywhere?
        lls = [pair[0] for pair in results]
        gold = doc["gold"]

        correct = np.argmax(lls) == gold
        # Normalize by character length of each choice before taking argmax.
        choice_lens = np.array([float(len(c)) for c in doc["choices"]])
        correct_norm = np.argmax(lls / choice_lens) == gold

        return {
            "acc": 1.0 if correct else 0.0,
            "acc_norm": 1.0 if correct_norm else 0.0,
        }

    def higher_is_better(self) -> dict:
        return {"acc": True, "acc_norm": True}

    def aggregation(self) -> dict:
        return {"acc": mean, "acc_norm": mean}
1806
+
1807
+
1808
class PerplexityTask(Task):
    """Scores whole documents by rolling loglikelihood; fewshot is disallowed."""

    OUTPUT_TYPE = "loglikelihood_rolling"

    def has_training_docs(self) -> bool:
        return False

    def fewshot_examples(self, k: int, rnd) -> List:
        """Perplexity tasks never use fewshot examples; k must be 0."""
        if k != 0:
            raise ValueError(
                "The number of fewshot examples must be 0 for perplexity tasks."
            )
        return []

    def fewshot_context(self, doc: dict, num_fewshot: int) -> Literal[""]:
        """Context is always empty: perplexity is measured on the raw document."""
        if num_fewshot != 0:
            raise ValueError(
                "The number of fewshot examples must be 0 for perplexity tasks."
            )
        return ""

    def higher_is_better(self) -> dict:
        # All perplexity-style metrics improve as they decrease.
        return {
            "word_perplexity": False,
            "byte_perplexity": False,
            "bits_per_byte": False,
        }

    def doc_to_decontamination_query(self, doc):
        return doc

    def doc_to_text(self, doc) -> str:
        return ""

    def doc_to_target(self, doc):
        return doc

    def construct_requests(self, doc: dict, ctx: Optional[str], **kwargs):
        """Build the single rolling-loglikelihood request for this document."""
        # A non-empty context would contradict fewshot_context() above.
        if bool(ctx):
            raise ValueError

        return Instance(
            request_type=self.OUTPUT_TYPE,
            doc=doc,
            arguments=(self.doc_to_target(doc),),
            idx=0,
            **kwargs,
        )

    def process_results(self, doc: dict, results: Tuple[float]) -> dict:
        """Pair the doc loglikelihood with its word/byte counts for aggregation."""
        (ll,) = results
        target = self.doc_to_target(doc)
        n_words = self.count_words(target)
        n_bytes = self.count_bytes(target)
        return {
            "word_perplexity": (ll, n_words),
            "byte_perplexity": (ll, n_bytes),
            "bits_per_byte": (ll, n_bytes),
        }

    def aggregation(self) -> dict:
        return {
            "word_perplexity": weighted_perplexity,
            "byte_perplexity": weighted_perplexity,
            "bits_per_byte": bits_per_byte,
        }

    @classmethod
    def count_bytes(cls, doc) -> int:
        """Length of the document in UTF-8 bytes."""
        return len(doc.encode("utf-8"))

    @classmethod
    def count_words(cls, doc) -> int:
        """Downstream tasks with custom word boundaries should override this!

        NOTE(review): re.split(r"\\s+", ...) yields empty fields for
        leading/trailing whitespace, so the count can differ from
        str.split(); kept as-is for parity with published metric values.
        """
        return len(re.split(r"\s+", doc))
Prism/LLaDA/LLaDA_Baseline/dllm_eval/caching/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Baseline/dllm_eval/caching/cache.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import logging
3
+ import os
4
+
5
+ import dill
6
+
7
+
8
eval_logger = logging.getLogger(__name__)


MODULE_DIR = os.path.dirname(os.path.realpath(__file__))

# Optional override for where cache files are written.
OVERRIDE_PATH = os.getenv("LM_HARNESS_CACHE_PATH")


# Default cache directory lives next to this module unless overridden.
PATH = OVERRIDE_PATH if OVERRIDE_PATH else f"{MODULE_DIR}/.cache"

# This should be sufficient for uniqueness
HASH_INPUT = "EleutherAI-lm-evaluation-harness"

HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest()

# Every cache file carries this suffix so delete_cache() can identify ours.
FILE_SUFFIX = f".{HASH_PREFIX}.pickle"
24
+
25
+
26
def load_from_cache(file_name: str, cache: bool = False):
    """Return the cached object for `file_name`, or None on a miss or when disabled."""
    if not cache:
        return
    try:
        cache_file = f"{PATH}/{file_name}{FILE_SUFFIX}"

        with open(cache_file, "rb") as fh:
            return dill.loads(fh.read())

    except Exception:
        # Any failure (missing or unreadable file) is treated as a cache miss.
        eval_logger.debug(f"{file_name} is not cached, generating...")
        pass
39
+
40
+
41
def save_to_cache(file_name, obj):
    """Serialize `obj` with dill and write it into the cache directory.

    Args:
        file_name: Base name of the cache entry (hash suffix is appended).
        obj: Any dill-serializable object.
    """
    # makedirs(exist_ok=True) handles nested override paths (LM_HARNESS_CACHE_PATH)
    # and is safe against the create-race when several eval processes start at
    # once; the previous exists()+mkdir pair was neither.
    os.makedirs(PATH, exist_ok=True)

    file_path = f"{PATH}/{file_name}{FILE_SUFFIX}"

    eval_logger.debug(f"Saving {file_path} to cache...")
    with open(file_path, "wb") as file:
        file.write(dill.dumps(obj))
50
+
51
+
52
def delete_cache(key: str = ""):
    """Delete every cache file whose name starts with `key`.

    NOTE: the "key" param is to allow for flexibility.
    """
    for entry in os.listdir(PATH):
        if entry.startswith(key) and entry.endswith(FILE_SUFFIX):
            os.unlink(f"{PATH}/{entry}")
Prism/LLaDA/LLaDA_Baseline/dllm_eval/decontamination/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Baseline/dllm_eval/decontamination/janitor.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import re
3
+ import string
4
+ import traceback
5
+ from typing import Iterator, List, Sequence, Tuple, TypeVar
6
+
7
+
8
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
try:
    import janitor_util

    JANITOR_CPP = True
except Exception:
    # Fall back to the (much slower) pure-Python implementation below.
    print("WARNING: C++ module could not be loaded. Janitor running in python mode")
    traceback.print_exc()
    JANITOR_CPP = False

# Generic element type for the ngram helpers.
T = TypeVar("T")
20
+
21
+
22
+ # Implementation from nltk source
23
+ # https://www.nltk.org/_modules/nltk/util.html
24
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
    """Yield successive n-grams (as tuples) from `sequence`.

    Implementation adapted from nltk (https://www.nltk.org/_modules/nltk/util.html).
    Yields nothing if the sequence has fewer than n items.
    """
    window = []
    # Pre-fill the sliding window with the first n-1 elements.
    for _ in range(n - 1):
        # PEP 479: catch StopIteration explicitly so it does not escape
        # the generator as a RuntimeError.
        try:
            window.append(next(sequence))
        except StopIteration:
            return
    for element in sequence:
        window.append(element)
        yield tuple(window)
        window.pop(0)
+
40
+
41
def word_ngrams(s: str, n: int) -> Iterator[str]:
    """Splits a string into ngram words"""
    words = s.split()  # not a generator :(
    return (" ".join(gram) for gram in form_ngrams(iter(words), n))
+ return (" ".join(ngram) for ngram in ngram_seqs)
46
+
47
+
48
+ # Does character sequences only - combined faster function to play around with later
49
+ # def word_ngrams_indices_combined(sequence, n):
50
+ # current_word = ""
51
+ # history = []
52
+ # gap = False;
53
+ # start = 0
54
+ # end = 0
55
+ # for character in sequence:
56
+ # if character == " ":
57
+ # if not gap:
58
+ # gap = True
59
+ # history.append(current_word)
60
+ # end += len(current_word) - 1
61
+ # current_word = ""
62
+ # if len(history) == n:
63
+ # yield (tuple(history), start, end)
64
+ # del history[0]
65
+ # start = end + 1
66
+ # end = start
67
+ # else:
68
+ # gap = False
69
+ # current_word += character
70
+
71
+
72
def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
    """Split on whitespace, pairing each token with its (start, end) indices.

    Indices are inclusive: `end` is the index of the token's last character.
    (cf. https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python)
    @:return generator((word, (start_idx, end_idx)), ...)
    """
    for match in re.finditer(r"\S+", s):
        yield match.group(0), (match.start(), match.end() - 1)
78
+
79
+
80
def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
    """Splits a string into pairs of (ngram words, their start/end indices)"""
    indexed_tokens = split_indices(s)

    # Each ngram is a sequence of (word, (start, end)) pairs; transpose it
    # into (words, spans) and emit the joined ngram together with the span
    # running from the first token's start to the last token's end.
    for gram in form_ngrams(indexed_tokens, n):
        words, spans = zip(*gram)
        yield " ".join(words), (spans[0][0], spans[-1][1])
106
+
107
+
108
class Janitor:
    """Removes registered "contaminant" ngrams (e.g. benchmark test text)
    from documents by excising a window of text around every match.

    # FIXME delete_chars: Should anything else go here? Special chars?
    """

    def __init__(
        self,
        ngram_n: int = 13,
        window_to_remove: int = 200,
        too_dirty_cutoff: int = 10,
        minimum_slice_length: int = 200,
        delete_chars: str = string.punctuation,
    ) -> None:
        """
        Args:
            ngram_n: word-ngram size used to index contamination.
            window_to_remove: characters excised around each dirty ngram.
            too_dirty_cutoff: number of matches after which the whole
                document is dropped instead of sliced.
            minimum_slice_length: shortest clean chunk worth keeping.
            delete_chars: characters stripped during normalization.
        """
        self.ngram_n = ngram_n
        self.window_to_remove = window_to_remove
        self.too_dirty_cutoff = too_dirty_cutoff
        self.minimum_slice_length = minimum_slice_length
        self.delete_chars = delete_chars

        self.dirt_ngrams = set()

        # If in python, we'll translate uppercase to lowercase and delete naughty characters.
        # This is fast by python standards
        # https://stackoverflow.com/questions/638893/what-is-the-most-efficient-way-in-python-to-convert-a-string-to-all-lowercase-st
        self.translation_table = str.maketrans(
            string.ascii_lowercase + string.ascii_uppercase,  # These characters
            string.ascii_lowercase * 2,  # Become these characters
            self.delete_chars,  # These are deleted
        )

    ##############
    # I/O for saving contamination ngrams
    ##############

    def save_contamination_ngrams(self, filename: str) -> None:
        """Persist the registered ngram set to `filename` with pickle."""
        with open(filename, "wb") as fp:
            # BUG FIX: this previously pickled the *filename* string instead
            # of the ngram set, so load_contamination_ngrams() could never
            # round-trip a saved set.
            pickle.dump(self.dirt_ngrams, fp)

    def load_contamination_ngrams(self, filename: str) -> None:
        """Replace the registered set with one saved by save_contamination_ngrams()."""
        with open(filename, "rb") as fp:
            self.dirt_ngrams = pickle.load(fp)

    ##############
    # Call these :)
    ##############

    def register_contaminant(self, dirt_string: str) -> None:
        """Register a string as contamination to be removed, e.g. a test set
        This breaks the dirt_string into ngrams to store for future cleaning"""
        if JANITOR_CPP:
            return self.register_contaminant_cpp(dirt_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.register_contaminant_python(dirt_string)

    def clean(self, dirty_string: str) -> List[str]:
        """Clean a string (e.g. a training set) by removing all ngrams previously
        registered as contaminants. Returns a list of clean chunks, or empty if
        the string was too dirty"""
        if JANITOR_CPP:
            return self.clean_cpp(dirty_string)
        else:
            print("WARNING: Janitor running in python mode")
            return self.clean_python(dirty_string)

    def _split_chunks(
        self, dirty_string: str, dirty_parts: Sequence[Tuple]
    ) -> List[str]:
        """Cut `window_to_remove` chars around each dirty span and return the
        remaining chunks that are at least `minimum_slice_length` long."""
        clean_chunks = []
        splice_idx = 0
        end = -1
        for i, (ngram, start, end) in enumerate(dirty_parts):
            # Too many matches: the document is considered irrecoverable.
            if i >= self.too_dirty_cutoff:
                return []
            start = max(0, start - self.window_to_remove)
            end = min(len(dirty_string), end + self.window_to_remove)

            if start - splice_idx > self.minimum_slice_length:
                clean_chunks.append(dirty_string[splice_idx:start])
            splice_idx = end

        # Keep the tail after the last dirty window, if long enough.
        if end < len(dirty_string) - self.minimum_slice_length:
            clean_chunks.append(dirty_string[end + 1 :])

        return clean_chunks

    ##############
    # Fast C++
    ##############

    def register_contaminant_cpp(self, dirt_string) -> None:
        self.dirt_ngrams.update(
            janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
        )

    def clean_cpp(self, dirty_string: str) -> List[str]:
        contamination_indices = janitor_util.clean_ngram_with_indices(
            dirty_string, self.delete_chars, self.ngram_n
        )
        return self._split_chunks(dirty_string, contamination_indices)

    ##############
    # Slow python
    ##############

    def normalize_string(self, s: str) -> str:
        """Lowercase and strip `delete_chars` in a single C-level pass."""
        return s.translate(self.translation_table)

    def register_contaminant_python(self, dirt_string: str) -> None:
        self.dirt_ngrams.update(
            word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
        )

    def clean_python(self, dirty_string: str) -> List[str]:
        # (None, start, end) mirrors the cpp triple shape expected by _split_chunks.
        contamination_indices = (
            (None, *idx_pair)
            for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
            if self.normalize_string(dirty_ngram) in self.dirt_ngrams
        )
        return self._split_chunks(dirty_string, contamination_indices)
225
+
226
+
227
+ ##################################################################
228
+ # Tests
229
+ #################################################################
230
+
231
+ # def print_cpp():
232
+ # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
233
+
234
+ # for i in range(1, 10, 2):
235
+ # pprint(janitor_util.clean_ngram(source, string.punctuation, i))
236
+ # for ngram, start, end in \
237
+ # janitor_util.clean_ngram_with_indices(source, string.punctuation, i):
238
+ # print(ngram, "\t", start, end, source[start:end].replace("\n", "\\n"))
239
+
240
+
241
+ # def test_cpp():
242
+ # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
243
+ # contaminant = "dirty boy. Clean he he"
244
+
245
+ # jan_python = Janitor()
246
+ # jan_cpp = Janitor()
247
+
248
+ # jan_python.register_contaminant_python(contaminant)
249
+ # jan_cpp.register_contaminant(contaminant)
250
+
251
+ # assert jan_python.dirt_ngrams == jan_cpp.dirt_ngrams, (jan_python.dirt_ngrams, jan_cpp.dirt_ngrams)
252
+
253
+ # assert jan_python.clean_python(source) == jan_cpp.clean(source), \
254
+ # (jan_python.clean_python(source), jan_cpp.clean(source))
255
+
256
+ # print("Passed test, python==cpp")
257
+
258
+
259
+ # def benchmark():
260
+ # # Download and put in data folder: enwik8 (100 MB) from https://cs.fit.edu/~mmahoney/compression/textdata.html
261
+ # setup = \
262
+ # """
263
+ # with open("data/enwik8", "r") as f:
264
+ # data = f.read()
265
+ # jan = Janitor(too_dirty_cutoff=1000)
266
+ # jan.register_contaminant('''
267
+ # theories is that there is a connection between &quot;geekdom&quot; and autism.
268
+ # This is hinted, for instance, by a ''Wired Magazine'' article in 2001 entitled &quot;
269
+ # The [[Geek]] Syndrome&quot;, which is a point argued by many in the autism rights
270
+ # movement{{ref|Wired}}. This article, many professionals assert, is just one example of
271
+ # the media's application of mental disease labels to what is actually variant normal behavior
272
+ # &amp;mdash;they argue that shyness, lack of athletic ability or social skills, and intellectual
273
+ # interests, even when they seem unusual to others, are not in themselves signs of autism or
274
+ # Asperger's syndrome. Others assert that it is actually the medical profession which is applying
275
+ # mental disease labels to children who in the past would have simply been accepted as a little
276
+ # different or even labeled 'gifted'. See [[clinomorphism]] for further discussion of this issue.
277
+ # Due to the recent publicity surrounding autism and autis
278
+ # ultan Al Nahyan]] granted [[Petroleum]] concessions, and oil was first found in 1958. At first,
279
+ # oil money had a marginal impact. A few lowrise concete buildings were erected, and the first
280
+ # paved road was completed in 1961, but Sheikh Shakbut, uncertain whether the new oil royalties
281
+ # would last, took a cautious approach, preferring to save the revenue rather than investing it in
282
+ # development. His brother, [[Zayed bin Sultan Al Nahayan]], saw that oil wealth had the potential
283
+ # to transform Abu Dhabi. The ruling Al Nahayan family decided that Sheikh Zayed should replace his
284
+ # brother as Ruler and carry out his vision of developing the country. On [[August 6]], [[1966]],
285
+ # with the assistance of the British, Sheikh Zayed became the new ruler. See generally, Al-Fahim, M,
286
+ # ''From Rags to Riches: A Story of Abu Dhabi'', Chapter Six (London Centre of Arab Studies, 1995),
287
+ # ISBN 1 900404 00 1. With the announcement by Britain in 1968 that it would withdraw from the
288
+ # Gulf area by 1971, Sheikh Zayed became the main driving force behind the formation of the
289
+ # [[United Arab Emirates]]. After the Emirates gained independence in 1971,
290
+ # ''')
291
+ # """
292
+
293
+ # n = 1
294
+ # print(f"Timing {n} run on 100 MB")
295
+ # print("Register contaminant")
296
+ # # print("\tPython", timeit.timeit("jan.register_contaminant_python(data)", setup=setup, globals=globals(), number=n))
297
+ # print("\tCpp", timeit.timeit("jan.register_contaminant(data)", setup=setup, globals=globals(), number=n))
298
+
299
+ # print("Clean")
300
+ # # print("\tPython", timeit.timeit("jan.clean_python(data)", setup=setup, globals=globals(), number=n))
301
+ # print("\tCpp", timeit.timeit("jan.clean(data)", setup=setup, globals=globals(), number=n))
302
+
303
+
304
+ # def test_janitor_general():
305
+ # source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
306
+ # contaminant = "dirty boy. Clean he he"
307
+
308
+ # jan = Janitor(ngram_n=3)
309
+ # jan.register_contaminant(contaminant)
310
+ # cleaned = " ".join(jan.clean(source))
311
+ # for contam in jan.dirt_ngrams:
312
+ # assert contam not in cleaned, contam
313
+
314
+ # filename = "data/saved_contam"
315
+ # jan.save_contamination_ngrams(filename)
316
+
317
+ # jan = Janitor(ngram_n=3)
318
+ # jan.load_contamination_ngrams(filename)
319
+ # cleaned = " ".join(jan.clean(source))
320
+ # for contam in jan.dirt_ngrams:
321
+ # assert contam not in cleaned, contam
322
+
323
+
324
+ # if __name__ == "__main__":
325
+ # test()
326
+ # # print_cpp()
327
+ # # test_cpp()
328
+ # # benchmark()
Prism/LLaDA/LLaDA_Baseline/dllm_eval/loggers/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .evaluation_tracker import EvaluationTracker
2
+ from .wandb_logger import WandbLogger
Prism/LLaDA/LLaDA_Baseline/dllm_eval/loggers/evaluation_tracker.py ADDED
@@ -0,0 +1,530 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ import time
6
+ from collections import defaultdict
7
+ from dataclasses import asdict, dataclass
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+
11
+ from datasets import load_dataset
12
+ from datasets.utils.metadata import MetadataConfigs
13
+ from huggingface_hub import (
14
+ DatasetCard,
15
+ DatasetCardData,
16
+ HfApi,
17
+ hf_hub_url,
18
+ )
19
+ from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
20
+
21
+ from dllm_eval.utils import (
22
+ get_file_datetime,
23
+ get_file_task_name,
24
+ get_results_filenames,
25
+ get_sample_results_filenames,
26
+ handle_non_serializable,
27
+ hash_string,
28
+ sanitize_list,
29
+ sanitize_model_name,
30
+ sanitize_task_name,
31
+ )
32
+
33
+
34
+ eval_logger = logging.getLogger(__name__)
35
+
36
+
37
@dataclass(init=False)
class GeneralConfigTracker:
    """
    Tracker for the evaluation parameters.

    Attributes:
        model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.)
        model_name (str): Name of the model.
        model_name_sanitized (str): Sanitized model name for directory creation.
        start_time (float): Start time of the experiment. Logged at class init.
        end_time (float): Start time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`]
        total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times).
    """

    model_source: str = None
    model_name: str = None
    model_name_sanitized: str = None
    system_instruction: str = None
    system_instruction_sha: str = None
    fewshot_as_multiturn: bool = None
    chat_template: str = None
    chat_template_sha: str = None
    start_time: float = None
    end_time: float = None
    total_evaluation_time_seconds: str = None

    def __init__(self) -> None:
        """Starts the evaluation timer."""
        self.start_time = time.perf_counter()

    @staticmethod
    def _get_model_name(model_args: str) -> str:
        """Extracts the model name from the model arguments."""
        # order does matter, e.g. peft and delta are provided together with pretrained
        for key in ("peft=", "delta=", "pretrained=", "model=", "path=", "engine="):
            if key in model_args:
                # Value is everything after the key, up to the next comma.
                return model_args.split(key)[1].split(",")[0]
        return ""

    def log_experiment_args(
        self,
        model_source: str,
        model_args: str,
        system_instruction: str,
        chat_template: str,
        fewshot_as_multiturn: bool,
    ) -> None:
        """Logs model parameters and job ID."""
        self.model_source = model_source
        self.model_name = GeneralConfigTracker._get_model_name(model_args)
        self.model_name_sanitized = sanitize_model_name(self.model_name)
        self.system_instruction = system_instruction
        self.system_instruction_sha = (
            hash_string(system_instruction) if system_instruction else None
        )
        self.chat_template = chat_template
        self.chat_template_sha = hash_string(chat_template) if chat_template else None
        self.fewshot_as_multiturn = fewshot_as_multiturn

    def log_end_time(self) -> None:
        """Logs the end time of the evaluation and calculates the total evaluation time."""
        self.end_time = time.perf_counter()
        self.total_evaluation_time_seconds = str(self.end_time - self.start_time)
107
+
108
+
109
+ class EvaluationTracker:
110
+ """
111
+ Keeps track and saves relevant information of the evaluation process.
112
+ Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested.
113
+ """
114
+
115
def __init__(
    self,
    output_path: str = None,
    hub_results_org: str = "",
    hub_repo_name: str = "",
    details_repo_name: str = "",
    results_repo_name: str = "",
    push_results_to_hub: bool = False,
    push_samples_to_hub: bool = False,
    public_repo: bool = False,
    token: str = "",
    leaderboard_url: str = "",
    point_of_contact: str = "",
    gated: bool = False,
) -> None:
    """
    Creates all the necessary loggers for evaluation tracking.

    Args:
        output_path (str): Path to save the results. If not provided, the results won't be saved.
        hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
        hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
        details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`.
        result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo.
        push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
        push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
        public_repo (bool): Whether to push the results to a public or private repository.
        token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
        leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
        point_of_contact (str): Contact information on the Hugging Face hub dataset card.
        gated (bool): Whether to gate the repository.
    """
    # Constructing the tracker also starts the run's wall-clock timer.
    self.general_config_tracker = GeneralConfigTracker()

    self.output_path = output_path
    self.push_results_to_hub = push_results_to_hub
    self.push_samples_to_hub = push_samples_to_hub
    self.public_repo = public_repo
    self.leaderboard_url = leaderboard_url
    self.point_of_contact = point_of_contact
    # The HfApi client only exists when a token was supplied.
    self.api = HfApi(token=token) if token else None
    self.gated_repo = gated

    # Pushing anything to the hub requires an authenticated client.
    if not self.api and (push_results_to_hub or push_samples_to_hub):
        raise ValueError(
            "Hugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. "
            "Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable."
        )

    # No org given: default to the token owner's namespace.
    if (
        self.api
        and hub_results_org == ""
        and (push_results_to_hub or push_samples_to_hub)
    ):
        hub_results_org = self.api.whoami()["name"]
        eval_logger.warning(
            f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
        )

    # Resolve repo names: hub_repo_name (deprecated) overrides both;
    # otherwise results default to the details repo when unset.
    if hub_repo_name == "":
        details_repo_name = (
            details_repo_name if details_repo_name != "" else "lm-eval-results"
        )
        results_repo_name = (
            results_repo_name if results_repo_name != "" else details_repo_name
        )
    else:
        details_repo_name = hub_repo_name
        results_repo_name = hub_repo_name
        eval_logger.warning(
            "hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
        )

    # Precompute both public and "-private" repo ids; which one is used
    # depends on self.public_repo at push time.
    self.details_repo = f"{hub_results_org}/{details_repo_name}"
    self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
    self.results_repo = f"{hub_results_org}/{results_repo_name}"
    self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
192
+
193
def save_results_aggregated(
    self,
    results: dict,
    samples: dict,
) -> None:
    """
    Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.

    Args:
        results (dict): The aggregated results to save.
        samples (dict): The samples results to save.
    """
    # Close out the run timer so total_evaluation_time_seconds is recorded.
    self.general_config_tracker.log_end_time()

    if self.output_path:
        try:
            eval_logger.info("Saving results aggregated")

            # calculate cumulative hash for each task - only if samples are provided
            task_hashes = {}
            if samples:
                for task_name, task_samples in samples.items():
                    sample_hashes = [
                        s["doc_hash"] + s["prompt_hash"] + s["target_hash"]
                        for s in task_samples
                    ]
                    task_hashes[task_name] = hash_string("".join(sample_hashes))

            # update initial results dict
            results.update({"task_hashes": task_hashes})
            results.update(asdict(self.general_config_tracker))
            dumped = json.dumps(
                results,
                indent=2,
                default=handle_non_serializable,
                ensure_ascii=False,
            )

            path = Path(self.output_path if self.output_path else Path.cwd())
            # ':' is not allowed in filenames on some platforms; use '-'.
            self.date_id = datetime.now().isoformat().replace(":", "-")
            # A .json output_path is treated as a filename template;
            # anything else is treated as a directory.
            if path.suffix == ".json":
                path.parent.mkdir(parents=True, exist_ok=True)
                file_results_aggregated = path.with_name(
                    f"{path.stem}_{self.date_id}.json"
                )
            else:
                path.mkdir(parents=True, exist_ok=True)
                file_results_aggregated = path.joinpath(
                    f"results_{self.date_id}.json"
                )

            file_results_aggregated.open("w", encoding="utf-8").write(dumped)

            if self.api and self.push_results_to_hub:
                # Select the public or "-private" repo per configuration.
                repo_id = (
                    self.results_repo
                    if self.public_repo
                    else self.results_repo_private
                )
                self.api.create_repo(
                    repo_id=repo_id,
                    repo_type="dataset",
                    private=not self.public_repo,
                    exist_ok=True,
                )
                self.api.upload_file(
                    repo_id=repo_id,
                    path_or_fileobj=str(file_results_aggregated),
                    path_in_repo=os.path.join(
                        self.general_config_tracker.model_name,
                        file_results_aggregated.name,
                    ),
                    repo_type="dataset",
                    commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
                )
                eval_logger.info(
                    "Successfully pushed aggregated results to the Hugging Face Hub. "
                    f"You can find them at: {repo_id}"
                )

        except Exception as e:
            # Saving is best-effort: log and continue rather than abort the run.
            eval_logger.warning("Could not save results aggregated")
            eval_logger.info(repr(e))
    else:
        eval_logger.info(
            "Output path not provided, skipping saving results aggregated"
        )
280
+
281
+ def save_results_samples(
282
+ self,
283
+ task_name: str,
284
+ samples: dict,
285
+ ) -> None:
286
+ """
287
+ Saves the samples results to the output path and pushes them to the Hugging Face hub if requested.
288
+
289
+ Args:
290
+ task_name (str): The task name to save the samples for.
291
+ samples (dict): The samples results to save.
292
+ """
293
+ if self.output_path:
294
+ try:
295
+ eval_logger.info(f"Saving per-sample results for: {task_name}")
296
+
297
+ path = Path(self.output_path if self.output_path else Path.cwd())
298
+ if path.suffix == ".json":
299
+ path = path.parent
300
+ path.mkdir(parents=True, exist_ok=True)
301
+
302
+ file_results_samples = path.joinpath(
303
+ f"samples_{task_name}_{self.date_id}.jsonl"
304
+ )
305
+
306
+ for sample in samples:
307
+ # we first need to sanitize arguments and resps
308
+ # otherwise we won't be able to load the dataset
309
+ # using the datasets library
310
+ arguments = {}
311
+ for i, arg in enumerate(sample["arguments"]):
312
+ arguments[f"gen_args_{i}"] = {}
313
+ for j, tmp in enumerate(arg):
314
+ arguments[f"gen_args_{i}"][f"arg_{j}"] = tmp
315
+
316
+ sample["resps"] = sanitize_list(sample["resps"])
317
+ sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
318
+ sample["arguments"] = arguments
319
+ sample["target"] = str(sample["target"])
320
+
321
+ sample_dump = (
322
+ json.dumps(
323
+ sample,
324
+ default=handle_non_serializable,
325
+ ensure_ascii=False,
326
+ )
327
+ + "\n"
328
+ )
329
+
330
+ with open(file_results_samples, "a", encoding="utf-8") as f:
331
+ f.write(sample_dump)
332
+
333
+ if self.api and self.push_samples_to_hub:
334
+ repo_id = (
335
+ self.details_repo
336
+ if self.public_repo
337
+ else self.details_repo_private
338
+ )
339
+ self.api.create_repo(
340
+ repo_id=repo_id,
341
+ repo_type="dataset",
342
+ private=not self.public_repo,
343
+ exist_ok=True,
344
+ )
345
+ try:
346
+ if self.gated_repo:
347
+ headers = build_hf_headers()
348
+ r = get_session().put(
349
+ url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
350
+ headers=headers,
351
+ json={"gated": "auto"},
352
+ )
353
+ hf_raise_for_status(r)
354
+ except Exception as e:
355
+ eval_logger.warning("Could not gate the repository")
356
+ eval_logger.info(repr(e))
357
+ self.api.upload_folder(
358
+ repo_id=repo_id,
359
+ folder_path=str(path),
360
+ path_in_repo=self.general_config_tracker.model_name_sanitized,
361
+ repo_type="dataset",
362
+ commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
363
+ )
364
+ eval_logger.info(
365
+ f"Successfully pushed sample results for task: {task_name} to the Hugging Face Hub. "
366
+ f"You can find them at: {repo_id}"
367
+ )
368
+
369
+ except Exception as e:
370
+ eval_logger.warning("Could not save sample results")
371
+ eval_logger.info(repr(e))
372
+ else:
373
+ eval_logger.info("Output path not provided, skipping saving sample results")
374
+
375
    def recreate_metadata_card(self) -> None:
        """
        Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
        """

        eval_logger.info("Recreating metadata card")
        repo_id = self.details_repo if self.public_repo else self.details_repo_private

        # enumerate everything already in the details repo so the card can
        # reference all past runs, not just the current one
        files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        results_files = get_results_filenames(files_in_repo)
        sample_files = get_sample_results_filenames(files_in_repo)

        # Build a dictionary to store the latest evaluation datetime for:
        # - Each tested model and its aggregated results
        # - Each task and sample results, if existing
        # i.e. {
        #     "org__model_name__gsm8k": "2021-09-01T12:00:00",
        #     "org__model_name__ifeval": "2021-09-01T12:00:00",
        #     "org__model_name__results": "2021-09-01T12:00:00"
        # }
        # datetime.min is the default so any real timestamp compares greater
        latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat())

        for file_path in sample_files:
            file_path = Path(file_path)
            filename = file_path.name
            # repo layout is <model_name>/<samples file>, so the parent dir
            # identifies the model
            model_name = file_path.parent
            task_name = get_file_task_name(filename)
            results_datetime = get_file_datetime(filename)
            task_name_sanitized = sanitize_task_name(task_name)
            # Results and sample results for the same model and task will have the same datetime
            samples_key = f"{model_name}__{task_name_sanitized}"
            results_key = f"{model_name}__results"
            # ISO-8601 strings compare chronologically, so max() works on them
            latest_datetime = max(
                latest_task_results_datetime[samples_key],
                results_datetime,
            )
            latest_task_results_datetime[samples_key] = latest_datetime
            latest_task_results_datetime[results_key] = max(
                latest_task_results_datetime[results_key],
                latest_datetime,
            )

        # Create metadata card
        card_metadata = MetadataConfigs()

        # Add the latest aggregated results to the metadata card for easy access
        for file_path in results_files:
            file_path = Path(file_path)
            results_filename = file_path.name
            model_name = file_path.parent
            eval_date = get_file_datetime(results_filename)
            eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
            # "**" glob so the split resolves regardless of the model subdir
            results_filename = Path("**") / Path(results_filename).name
            config_name = f"{model_name}__results"
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )

            if eval_date_sanitized == sanitized_last_eval_date_results:
                # Ensure that all results files are listed in the metadata card
                current_results = card_metadata.get(config_name, {"data_files": []})
                current_results["data_files"].append(
                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
                )
                card_metadata[config_name] = current_results
                # If the results file is the newest, update the "latest" field in the metadata card
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )

        # Add the tasks details configs
        for file_path in sample_files:
            file_path = Path(file_path)
            filename = file_path.name
            model_name = file_path.parent
            task_name = get_file_task_name(filename)
            eval_date = get_file_datetime(filename)
            task_name_sanitized = sanitize_task_name(task_name)
            eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date)
            results_filename = Path("**") / Path(filename).name
            config_name = f"{model_name}__{task_name_sanitized}"
            sanitized_last_eval_date_results = re.sub(
                r"[^\w\.]", "_", latest_task_results_datetime[config_name]
            )
            if eval_date_sanitized == sanitized_last_eval_date_results:
                # Ensure that all sample results files are listed in the metadata card
                current_details_for_task = card_metadata.get(
                    config_name, {"data_files": []}
                )
                current_details_for_task["data_files"].append(
                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
                )
                card_metadata[config_name] = current_details_for_task
                # If the samples results file is the newest, update the "latest" field in the metadata card
                card_metadata[config_name]["data_files"].append(
                    {"split": "latest", "path": [str(results_filename)]}
                )

        # Get latest results and extract info to update metadata card examples
        latest_datetime = max(latest_task_results_datetime.values())
        latest_model_name = max(
            latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k]
        )
        # filenames use "-" where ISO timestamps use ":" (see save_results_aggregated)
        last_results_file = [
            f for f in results_files if latest_datetime.replace(":", "-") in f
        ][0]
        last_results_file_path = hf_hub_url(
            repo_id=repo_id, filename=last_results_file, repo_type="dataset"
        )
        latest_results_file = load_dataset(
            "json", data_files=last_results_file_path, split="train"
        )
        results_dict = latest_results_file["results"][0]
        new_dictionary = {"all": results_dict}
        new_dictionary.update(results_dict)
        results_string = json.dumps(new_dictionary, indent=4)

        # Build the free-text dataset summary shown on the Hub dataset page
        dataset_summary = (
            "Dataset automatically created during the evaluation run of model "
        )
        if self.general_config_tracker.model_source == "hf":
            dataset_summary += f"[{self.general_config_tracker.model_name}](https://huggingface.co/{self.general_config_tracker.model_name})\n"
        else:
            dataset_summary += f"{self.general_config_tracker.model_name}\n"
        dataset_summary += (
            f"The dataset is composed of {len(card_metadata) - 1} configuration(s), each one corresponding to one of the evaluated task.\n\n"
            f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
            'configuration, the split being named using the timestamp of the run.The "train" split is always pointing to the latest results.\n\n'
            'An additional configuration "results" store all the aggregated results of the run.\n\n'
            "To load the details from a run, you can for instance do the following:\n"
        )
        if self.general_config_tracker.model_source == "hf":
            dataset_summary += (
                "```python\nfrom datasets import load_dataset\n"
                f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n'
            )
        dataset_summary += (
            "## Latest results\n\n"
            f"These are the [latest results from run {latest_datetime}]({last_results_file_path.replace('/resolve/', '/blob/')}) "
            "(note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. "
            'You find each in the results and the "latest" split for each eval):\n\n'
            f"```python\n{results_string}\n```"
        )
        card_data = DatasetCardData(
            dataset_summary=dataset_summary,
            repo_url=f"https://huggingface.co/{self.general_config_tracker.model_name}",
            pretty_name=f"Evaluation run of {self.general_config_tracker.model_name}",
            leaderboard_url=self.leaderboard_url,
            point_of_contact=self.point_of_contact,
        )
        card_metadata.to_dataset_card_data(card_data)
        card = DatasetCard.from_template(
            card_data,
            pretty_name=card_data.pretty_name,
        )
        card.push_to_hub(repo_id, repo_type="dataset")
Prism/LLaDA/LLaDA_Baseline/dllm_eval/loggers/utils.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import re
4
+ import subprocess
5
+ from importlib.metadata import version
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional, Tuple, Union
8
+
9
+ import numpy as np
10
+ from torch.utils.collect_env import get_pretty_env_info
11
+ from transformers import __version__ as trans_version
12
+
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
    """Remove the ',none' substring from the input_string if it exists at the end.

    Args:
        input_string (str): The input string from which to remove the ',none' substring.

    Returns:
        Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
            and a boolean indicating whether the modification was made (True) or not (False).
    """
    suffix = ",none"
    # only a trailing occurrence counts; interior ',none' is left untouched
    if input_string.endswith(suffix):
        return input_string[: -len(suffix)], True
    return input_string, False
37
+
38
+
39
+ def _handle_non_serializable(o: Any) -> Union[int, str, list]:
40
+ """Handle non-serializable objects by converting them to serializable types.
41
+
42
+ Args:
43
+ o (Any): The object to be handled.
44
+
45
+ Returns:
46
+ Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
47
+ it will be converted to int. If the object is of type set, it will be converted
48
+ to a list. Otherwise, it will be converted to str.
49
+ """
50
+ if isinstance(o, np.int64) or isinstance(o, np.int32):
51
+ return int(o)
52
+ elif isinstance(o, set):
53
+ return list(o)
54
+ else:
55
+ return str(o)
56
+
57
+
58
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
    """Read the current commit hash directly from a repo's ``.git`` metadata.

    Does not shell out to git. Returns ``None`` when ``repo_path`` is not a
    git checkout or when any error occurs while reading the metadata.
    """
    try:
        git_folder = Path(repo_path, ".git")
        if git_folder.is_file():
            # Submodule/worktree case: .git is a file whose first line is
            # "gitdir: <path>" pointing at the real git directory.
            git_folder = Path(
                git_folder.parent,
                git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
            )
        if Path(git_folder, "HEAD").exists():
            # HEAD's first line is typically "ref: refs/heads/<branch>";
            # take the last space-separated token as the ref path.
            head_name = (
                Path(git_folder, "HEAD")
                .read_text(encoding="utf-8")
                .split("\n")[0]
                .split(" ")[-1]
            )
            head_ref = Path(git_folder, head_name)
            git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
            # NOTE(review): a detached HEAD stores the hash itself, not a ref;
            # reading it as a ref path then raises and falls into the except
            # below, returning None — confirm this is the intended behavior.
        else:
            git_hash = None
    except Exception as err:
        logger.debug(
            f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
        )
        return None
    return git_hash
83
+
84
+
85
def get_git_commit_hash():
    """
    Gets the git commit hash of your current repo (if it exists).
    Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
    """
    try:
        return (
            subprocess.check_output(["git", "describe", "--always"]).strip().decode()
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        # FileNotFoundError occurs when git not installed on system;
        # fall back to reading .git metadata directly from the working dir.
        return get_commit_from_path(os.getcwd())  # git hash of repo if exists
97
+
98
+
99
def add_env_info(storage: Dict[str, Any]):
    """Merge environment metadata (torch env report, package versions, git
    hash of the parent directory) into *storage* in place."""
    try:
        pretty_env_info = get_pretty_env_info()
    except Exception as err:
        # never fail the run over diagnostics; record the error text instead
        pretty_env_info = str(err)
    try:
        dllm_eval_version = version("dllm_eval")
    except Exception as err:
        dllm_eval_version = str(err)
    storage.update(
        {
            "pretty_env_info": pretty_env_info,
            "transformers_version": trans_version,
            "dllm_eval_version": dllm_eval_version,
            # git hash of the directory above cwd, in case this repo is a submodule
            "upper_git_hash": get_commit_from_path(Path(os.getcwd(), "..")),
        }
    )
119
+
120
+
121
+ def add_tokenizer_info(storage: Dict[str, Any], lm):
122
+ if getattr(lm, "tokenizer", False):
123
+ try:
124
+ tokenizer_info = {
125
+ "tokenizer_pad_token": [
126
+ lm.tokenizer.pad_token,
127
+ str(lm.tokenizer.pad_token_id),
128
+ ],
129
+ "tokenizer_eos_token": [
130
+ lm.tokenizer.eos_token,
131
+ str(lm.tokenizer.eos_token_id),
132
+ ],
133
+ "tokenizer_bos_token": [
134
+ lm.tokenizer.bos_token,
135
+ str(lm.tokenizer.bos_token_id),
136
+ ],
137
+ "eot_token_id": getattr(lm, "eot_token_id", None),
138
+ "max_length": getattr(lm, "max_length", None),
139
+ }
140
+ storage.update(tokenizer_info)
141
+ except Exception as err:
142
+ logger.debug(
143
+ f"Logging detailed tokenizer info failed with {err}, skipping..."
144
+ )
145
+ # seems gguf and textsynth do not have tokenizer
146
+ else:
147
+ logger.debug(
148
+ "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
149
+ )
Prism/LLaDA/LLaDA_Baseline/dllm_eval/loggers/wandb_logger.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import json
3
+ import logging
4
+ from typing import Any, Dict, List, Literal, Tuple
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from packaging.version import Version
9
+
10
+ from dllm_eval.loggers.utils import _handle_non_serializable, remove_none_pattern
11
+
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def get_wandb_printer() -> Literal["Printer"]:
    """Returns a wandb printer instance for pretty stdout."""
    # imported lazily so wandb stays an optional dependency
    from wandb.sdk.lib.printer import new_printer

    return new_printer()
22
+
23
+
24
class WandbLogger:
    def __init__(self, init_args=None, config_args=None) -> None:
        """Attaches to wandb logger if already initialized. Otherwise, passes init_args to wandb.init() and config_args to wandb.config.update()

        Args:
            init_args Optional[Dict]: Arguments for init configuration.
            config_args Optional[Dict]: Arguments for config

        Parse and log the results returned from evaluator.simple_evaluate() with:
            wandb_logger.post_init(results)
            wandb_logger.log_eval_result()
            wandb_logger.log_eval_samples(results["samples"])
        """
        try:
            import wandb

            # NOTE(review): the assert already rejects wandb < 0.13.6, so the
            # `wandb.require` branch below is unreachable — confirm intent.
            assert Version(wandb.__version__) >= Version("0.13.6")
            if Version(wandb.__version__) < Version("0.13.6"):
                wandb.require("report-editing:v0")
        except Exception as e:
            logger.warning(
                "To use the wandb reporting functionality please install wandb>=0.13.6.\n"
                "To install the latest version of wandb run `pip install wandb --upgrade`\n"
                f"{e}"
            )

        self.wandb_args: Dict[str, Any] = init_args or {}
        self.wandb_config_args: Dict[str, Any] = config_args or {}

        # pop the step key from the args to save for all logging calls
        self.step = self.wandb_args.pop("step", None)

        # initialize a W&B run
        if wandb.run is None:
            self.run = wandb.init(**self.wandb_args)
            if self.wandb_config_args:
                self.run.config.update(self.wandb_config_args)
        else:
            # reuse an already-active run instead of starting a new one
            self.run = wandb.run

        self.printer = get_wandb_printer()

    def post_init(self, results: Dict[str, Any]) -> None:
        # Snapshot the evaluator output; deepcopy so later sanitization
        # cannot mutate the caller's dict.
        self.results: Dict[str, Any] = copy.deepcopy(results)
        self.task_names: List[str] = list(results.get("results", {}).keys())
        self.group_names: List[str] = list(results.get("groups", {}).keys())

    def _get_config(self) -> Dict[str, Any]:
        """Get configuration parameters."""
        self.task_configs = self.results.get("configs", {})
        cli_configs = self.results.get("config", {})
        configs = {
            "task_configs": self.task_configs,
            "cli_configs": cli_configs,
        }

        return configs

    def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]:
        """Sanitize the results dictionary.

        Returns a tuple of (string-valued metrics destined for wandb summary,
        numeric metrics flattened to "task/metric" keys for wandb.log).
        """
        _results = copy.deepcopy(self.results.get("results", dict()))

        # Remove None from the metric string name
        # (iterate over a copy since we mutate _results while looping)
        tmp_results = copy.deepcopy(_results)
        for task_name in self.task_names:
            task_result = tmp_results.get(task_name, dict())
            for metric_name, metric_value in task_result.items():
                _metric_name, removed = remove_none_pattern(metric_name)
                if removed:
                    _results[task_name][_metric_name] = metric_value
                    _results[task_name].pop(metric_name)

        # remove string valued keys from the results dict
        # (wandb.log only accepts numeric values; strings go to run.summary)
        wandb_summary = {}
        for task in self.task_names:
            task_result = _results.get(task, dict())
            for metric_name, metric_value in task_result.items():
                if isinstance(metric_value, str):
                    wandb_summary[f"{task}/{metric_name}"] = metric_value

        for summary_metric, summary_value in wandb_summary.items():
            _task, _summary_metric = summary_metric.split("/")
            _results[_task].pop(_summary_metric)

        # flatten {task: {metric: value}} into {"task/metric": value}
        tmp_results = copy.deepcopy(_results)
        for task_name, task_results in tmp_results.items():
            for metric_name, metric_value in task_results.items():
                _results[f"{task_name}/{metric_name}"] = metric_value
                _results[task_name].pop(metric_name)
        for task in self.task_names:
            _results.pop(task)

        return wandb_summary, _results

    def _log_results_as_table(self) -> None:
        """Generate and log evaluation results as a table to W&B."""
        columns = [
            "Version",
            "Filter",
            "num_fewshot",
            "Metric",
            "Value",
            "Stderr",
        ]

        def make_table(columns: List[str], key: str = "results"):
            import wandb

            table = wandb.Table(columns=columns)
            results = copy.deepcopy(self.results)

            for k, dic in results.get(key).items():
                # groups are rendered in their own table, skip them here
                if k in self.group_names and not key == "groups":
                    continue
                version = results.get("versions").get(k)
                if version == "N/A":
                    version = None
                n = results.get("n-shot").get(k)

                # metric keys look like "<metric>,<filter>"
                for (mf), v in dic.items():
                    m, _, f = mf.partition(",")
                    if m.endswith("_stderr"):
                        continue
                    if m == "alias":
                        continue

                    if m + "_stderr" + "," + f in dic:
                        se = dic[m + "_stderr" + "," + f]
                        if se != "N/A":
                            se = "%.4f" % se
                        table.add_data(*[k, version, f, n, m, str(v), str(se)])
                    else:
                        table.add_data(*[k, version, f, n, m, str(v), ""])

            return table

        # log the complete eval result to W&B Table
        table = make_table(["Tasks"] + columns, "results")
        self.run.log({"evaluation/eval_results": table}, step=self.step)

        if "groups" in self.results.keys():
            table = make_table(["Groups"] + columns, "groups")
            self.run.log({"evaluation/group_eval_results": table}, step=self.step)

    def _log_results_as_artifact(self) -> None:
        """Log results as JSON artifact to W&B."""
        import wandb

        dumped = json.dumps(
            self.results, indent=2, default=_handle_non_serializable, ensure_ascii=False
        )
        artifact = wandb.Artifact("results", type="eval_results")
        with artifact.new_file("results.json", mode="w", encoding="utf-8") as f:
            f.write(dumped)
        self.run.log_artifact(artifact)

    def log_eval_result(self) -> None:
        """Log evaluation results to W&B."""
        # Log configs to wandb
        configs = self._get_config()
        self.run.config.update(configs, allow_val_change=self.step is not None)

        wandb_summary, self.wandb_results = self._sanitize_results_dict()
        # update wandb.run.summary with items that were removed
        self.run.summary.update(wandb_summary)
        # Log the evaluation metrics to wandb
        self.run.log(self.wandb_results, step=self.step)
        # Log the evaluation metrics as W&B Table
        self._log_results_as_table()
        # Log the results dict as json to W&B Artifacts
        self._log_results_as_artifact()

    def _generate_dataset(
        self, data: List[Dict[str, Any]], config: Dict[str, Any]
    ) -> pd.DataFrame:
        """Generate a dataset from evaluation data.

        Args:
            data (List[Dict[str, Any]]): The data to generate a dataset for.
            config (Dict[str, Any]): The configuration of the task.

        Returns:
            pd.DataFrame: A dataframe that is ready to be uploaded to W&B.
        """
        ids = [x["doc_id"] for x in data]
        labels = [x["target"] for x in data]
        instance = [""] * len(ids)
        resps = [""] * len(ids)
        filtered_resps = [""] * len(ids)
        model_outputs = {}

        metrics_list = config["metric_list"]
        metrics = {}
        for metric in metrics_list:
            metric = metric.get("metric")
            if metric in ["word_perplexity", "byte_perplexity", "bits_per_byte"]:
                # perplexity metrics are (loglikelihood, count) pairs
                metrics[f"{metric}_loglikelihood"] = [x[metric][0] for x in data]
                if metric in ["byte_perplexity", "bits_per_byte"]:
                    metrics[f"{metric}_bytes"] = [x[metric][1] for x in data]
                else:
                    metrics[f"{metric}_words"] = [x[metric][1] for x in data]
            else:
                metrics[metric] = [x[metric] for x in data]

        # shape of resps/filtered_resps depends on the task's output type
        if config["output_type"] == "loglikelihood":
            instance = [x["arguments"][0][0] for x in data]
            labels = [x["arguments"][0][1] for x in data]
            resps = [
                f"log probability of continuation is {x['resps'][0][0][0]} "
                + "\n\n"
                + "continuation will {} generated with greedy sampling".format(
                    "not be" if not x["resps"][0][0][1] else "be"
                )
                for x in data
            ]
            filtered_resps = [
                f"log probability of continuation is {x['filtered_resps'][0][0]} "
                + "\n\n"
                + "continuation will {} generated with greedy sampling".format(
                    "not be" if not x["filtered_resps"][0][1] else "be"
                )
                for x in data
            ]
        elif config["output_type"] == "multiple_choice":
            instance = [x["arguments"][0][0] for x in data]
            choices = [
                "\n".join([f"{idx}. {y[1]}" for idx, y in enumerate(x["arguments"])])
                for x in data
            ]
            # the predicted choice is the index with the highest loglikelihood
            resps = [np.argmax([n[0][0] for n in x["resps"]]) for x in data]
            filtered_resps = [
                np.argmax([n[0] for n in x["filtered_resps"]]) for x in data
            ]
        elif config["output_type"] == "loglikelihood_rolling":
            instance = [x["arguments"][0][0] for x in data]
            resps = [x["resps"][0][0] for x in data]
            filtered_resps = [x["filtered_resps"][0] for x in data]
        elif config["output_type"] == "generate_until":
            instance = [x["arguments"][0][0] for x in data]
            resps = [x["resps"][0][0] for x in data]
            filtered_resps = [x["filtered_resps"][0] for x in data]

        model_outputs["raw_predictions"] = resps
        model_outputs["filtered_predictions"] = filtered_resps

        df_data = {
            "id": ids,
            "data": instance,
        }
        if config["output_type"] == "multiple_choice":
            df_data["choices"] = choices

        tmp_data = {
            "input_len": [len(x) for x in instance],
            "labels": labels,
            "output_type": config["output_type"],
        }
        df_data.update(tmp_data)
        df_data.update(model_outputs)
        df_data.update(metrics)

        return pd.DataFrame(df_data)

    def _log_samples_as_artifact(
        self, data: List[Dict[str, Any]], task_name: str
    ) -> None:
        import wandb

        # log the samples as an artifact
        dumped = json.dumps(
            data,
            indent=2,
            default=_handle_non_serializable,
            ensure_ascii=False,
        )
        artifact = wandb.Artifact(f"{task_name}", type="samples_by_task")
        with artifact.new_file(
            f"{task_name}_eval_samples.json", mode="w", encoding="utf-8"
        ) as f:
            f.write(dumped)
        self.run.log_artifact(artifact)
        # artifact.wait()

    def log_eval_samples(self, samples: Dict[str, List[Dict[str, Any]]]) -> None:
        """Log evaluation samples to W&B.

        Args:
            samples (Dict[str, List[Dict[str, Any]]]): Evaluation samples for each task.
        """
        task_names: List[str] = [
            x for x in self.task_names if x not in self.group_names
        ]

        # partition tasks into those belonging to a group and standalone ones
        ungrouped_tasks = []
        tasks_by_groups = {}

        for task_name in task_names:
            group_names = self.task_configs[task_name].get("group", None)
            if group_names:
                if isinstance(group_names, str):
                    group_names = [group_names]

                for group_name in group_names:
                    if not tasks_by_groups.get(group_name):
                        tasks_by_groups[group_name] = [task_name]
                    else:
                        tasks_by_groups[group_name].append(task_name)
            else:
                ungrouped_tasks.append(task_name)

        for task_name in ungrouped_tasks:
            eval_preds = samples[task_name]

            # log the samples as a W&B Table
            df = self._generate_dataset(eval_preds, self.task_configs.get(task_name))
            self.run.log({f"{task_name}_eval_results": df}, step=self.step)

            # log the samples as a json file as W&B Artifact
            self._log_samples_as_artifact(eval_preds, task_name)

        for group, grouped_tasks in tasks_by_groups.items():
            # grouped tasks are concatenated into a single table tagged with
            # group/task columns
            grouped_df = pd.DataFrame()
            for task_name in grouped_tasks:
                eval_preds = samples[task_name]
                df = self._generate_dataset(
                    eval_preds, self.task_configs.get(task_name)
                )
                df["group"] = group
                df["task"] = task_name
                grouped_df = pd.concat([grouped_df, df], ignore_index=True)

                # log the samples as a json file as W&B Artifact
                self._log_samples_as_artifact(eval_preds, task_name)

            self.run.log({f"{group}_eval_results": grouped_df}, step=self.step)
Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/LLaDA.py ADDED
@@ -0,0 +1,786 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from datetime import timedelta
4
+ from typing import Dict, List, Literal, Optional, Tuple, Union, TypeVar
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ import transformers
9
+ import json
10
+ from accelerate import (
11
+ Accelerator,
12
+ InitProcessGroupKwargs,
13
+ )
14
+ from datasets import Dataset
15
+ from accelerate.utils import get_max_memory
16
+ from packaging import version
17
+ from tqdm import tqdm
18
+ import torch.distributed as dist
19
+ from transformers.models.auto.modeling_auto import (
20
+ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
21
+ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
22
+ )
23
+ from dllm_eval.api.instance import Instance
24
+ from dllm_eval.api.model import LM, TemplateLM
25
+ from dllm_eval.api.registry import register_model
26
+ from dllm_eval.models.utils import get_dtype, configure_pad_token
27
+
28
+ try:
29
+ from .hts_sampler import HTSSampler
30
+ except ImportError:
31
+ HTSSampler = None
32
+
33
+ eval_logger = logging.getLogger(__name__)
34
+ T = TypeVar("T", bound="LM")
35
+
36
+
37
def add_gumbel_noise(logits, temperature):
    """Perturb `logits` for Gumbel-style sampling.

    With temperature 0.0 the logits are returned untouched (greedy decoding).
    Otherwise the logits are exponentiated and divided by uniform noise raised
    to `temperature`, so an argmax over the result yields a random draw whose
    spread grows with temperature.
    """
    if temperature == 0.0:
        return logits
    # Work in float32 for numerical stability of exp/log.
    logits = logits.to(torch.float32)
    uniform = torch.rand_like(logits, dtype=torch.float32)
    return torch.exp(logits) / (-torch.log(uniform)) ** temperature
45
+
46
+
47
def get_num_transfer_tokens(mask_index, steps):
    """Per-sample schedule of how many masked tokens to commit at each step.

    Each row's masked-token count is split as evenly as possible over `steps`
    steps; when the count is not divisible, the earliest steps receive one
    extra token so the schedule sums exactly to the masked count.
    """
    masked_counts = mask_index.sum(dim=1, keepdim=True)
    per_step = masked_counts // steps
    leftover = masked_counts % steps
    schedule = per_step.expand(-1, steps).clone()
    if leftover.sum() > 0:
        step_ids = torch.arange(steps, device=mask_index.device)
        schedule[step_ids.unsqueeze(0) < leftover] += 1
    return schedule.to(torch.int64)
58
+
59
+
60
@torch.no_grad()
def generate_llada_v1(model, prompt, attention_mask=None, steps=128, gen_length=128,
                      block_length=128, temperature=0., cfg_scale=0.,
                      remasking='low_confidence', mask_id=126336,
                      logits_eos_inf=False, confidence_eos_eot_inf=False):
    """
    LLaDA v1 generation function
    This is the original generate function from LLaDA v1

    Iteratively denoises `gen_length` masked positions appended after `prompt`,
    processing the generation region block by block. Within each block,
    `steps_per_block` refinement rounds each commit the highest-confidence
    predictions (count given by get_num_transfer_tokens) and leave the rest
    masked for the next round.

    NOTE(review): if cfg_scale > 0 and attention_mask is None,
    `attention_mask_` below is referenced while unbound -> NameError.
    Confirm CFG is only used together with an attention mask.
    """
    # Working canvas: prompt tokens followed by gen_length mask tokens.
    x = torch.full((prompt.shape[0], prompt.shape[1] + gen_length), mask_id,
                   dtype=torch.long).to(model.device)
    x[:, :prompt.shape[1]] = prompt.clone()

    if attention_mask is not None:
        # Extend the mask so the generation region is attended to.
        attention_mask = torch.cat([
            attention_mask,
            torch.ones((prompt.shape[0], gen_length), dtype=attention_mask.dtype,
                       device=model.device)
        ], dim=-1)

    # True at prompt positions (everything that is not a mask token initially).
    prompt_index = (x != mask_id)

    assert gen_length % block_length == 0
    num_blocks = gen_length // block_length

    assert steps % num_blocks == 0
    steps_per_block = steps // num_blocks

    for num_block in range(num_blocks):
        # Masked positions inside the current generation block only.
        block_mask_index = (x[:, prompt.shape[1] + num_block * block_length:
                            prompt.shape[1] + (num_block + 1) * block_length] == mask_id)
        num_transfer_tokens = get_num_transfer_tokens(block_mask_index, steps_per_block)

        for i in range(steps_per_block):
            mask_index = (x == mask_id)

            if cfg_scale > 0.:
                # Classifier-free guidance: second forward pass with the
                # prompt masked out as the unconditional branch.
                un_x = x.clone()
                un_x[prompt_index] = mask_id
                x_ = torch.cat([x, un_x], dim=0)
                if attention_mask is not None:
                    attention_mask_ = torch.cat([attention_mask, attention_mask], dim=0)
                logits = model(x_, attention_mask=attention_mask_).logits
                logits, un_logits = torch.chunk(logits, 2, dim=0)
                logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
            else:
                logits = model(x, attention_mask=attention_mask).logits

            if logits_eos_inf:
                # Suppress token id 126081 entirely (presumably EOS — confirm
                # against the LLaDA vocabulary).
                logits[:, :, 126081] = -torch.inf

            logits_with_noise = add_gumbel_noise(logits, temperature=temperature)
            x0 = torch.argmax(logits_with_noise, dim=-1)

            if confidence_eos_eot_inf:
                # NOTE(review): chained assignment writes id 126081 into
                # `logits_with_noise` (dead — x0 was already computed above)
                # and id 126348 into `logits` (which feeds the confidence
                # softmax below). Likely intended to set BOTH ids on `logits`;
                # confirm against upstream LLaDA.
                logits_with_noise[:, :, 126081] = logits[:, :, 126348] = -torch.inf

            if remasking == 'low_confidence':
                # Confidence = model probability of the chosen token.
                p = F.softmax(logits, dim=-1)
                x0_p = torch.squeeze(
                    torch.gather(p, dim=-1, index=torch.unsqueeze(x0, -1)), -1)
            elif remasking == 'random':
                x0_p = torch.rand((x0.shape[0], x0.shape[1]), device=x0.device)
            else:
                raise NotImplementedError(remasking)

            # Never commit tokens beyond the current block.
            x0_p[:, prompt.shape[1] + (num_block + 1) * block_length:] = -np.inf

            # Keep already-committed tokens; candidates only at masked slots.
            x0 = torch.where(mask_index, x0, x)
            confidence = torch.where(mask_index, x0_p, -np.inf)

            # Commit the top-k most confident predictions per sample.
            transfer_index = torch.zeros_like(x0, dtype=torch.bool, device=x0.device)
            for j in range(confidence.shape[0]):
                _, select_index = torch.topk(confidence[j], k=num_transfer_tokens[j, i])
                transfer_index[j, select_index] = True

            x[transfer_index] = x0[transfer_index]

    return x
139
+
140
+
141
+ @register_model("LLaDA")
142
+ class LLaDA(TemplateLM):
143
+ AUTO_MODEL_CLASS = transformers.AutoModel
144
+ _DEFAULT_MAX_LENGTH = 20480
145
+
146
    def __init__(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
        backend: Literal["default", "causal", "seq2seq"] = "causal",
        revision: Optional[str] = "main",
        subfolder: Optional[str] = None,
        tokenizer: Optional[
            Union[
                str,
                transformers.PreTrainedTokenizer,
                transformers.PreTrainedTokenizerFast,
            ]
        ] = None,
        truncation: Optional[bool] = False,
        logits_cache: bool = True,
        max_length: Optional[int] = None,
        device: Optional[str] = "cuda",
        dtype: Optional[Union[str, torch.dtype]] = "auto",
        batch_size: Optional[Union[int]] = 1,
        max_batch_size: Optional[int] = 64,
        trust_remote_code: Optional[bool] = True,
        use_fast_tokenizer: Optional[bool] = True,
        add_bos_token: Optional[bool] = False,
        escape_until: Optional[bool] = False,
        prefix_token_id: Optional[int] = None,
        parallelize: Optional[bool] = False,
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
        peft: Optional[str] = None,
        delta: Optional[str] = None,
        autogptq: Optional[Union[bool, str]] = False,
        gptqmodel: Optional[bool] = False,
        gguf_file: Optional[str] = None,
        mc_num: int = 1024,
        remasking: str = "low_confidence",
        mask_id: int = 126336,  # LLaDA v1 default mask_id
        is_check_greedy: bool = True,
        assistant_prefix: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Load a LLaDA v1 checkpoint (or adopt a pre-built model), set up the
        tokenizer, device placement / Accelerate multi-process state, batch
        sizing, and — when available — an HTSSampler bound to the model.

        `pretrained` may be a HF hub id / path, or an already-instantiated
        model (in which case most loading-related arguments are ignored).
        """
        super().__init__()
        self.mc_num = mc_num
        self.mask_id = mask_id
        self.remasking = remasking
        self.pretrained = pretrained
        self.is_check_greedy = is_check_greedy
        self.assistant_prefix = assistant_prefix
        self.add_bos_token = add_bos_token
        self.escape_until = escape_until

        if not isinstance(pretrained, str):
            # Caller passed an already-initialized model object.
            eval_logger.warning(
                "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored."
            )
            assert not parallelize, (
                "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
            )
            self._model = pretrained
            self._device = self._model.device
            self._config = self._model.config
            gpus = 0
        else:
            assert isinstance(device, str)
            assert isinstance(pretrained, str)
            assert isinstance(batch_size, (int, str))
            gpus = torch.cuda.device_count()
            # Very long timeout so slow checkpoint loads don't kill the group.
            accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
            accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
            if accelerator.num_processes > 1:
                # Only keep the accelerator when actually multi-process;
                # downstream code checks `hasattr(self, "accelerator")`.
                self.accelerator = accelerator
            if "npu" in accelerator.device.type:
                gpus = torch.npu.device_count()
            if not (parallelize or accelerator.num_processes > 1):
                # Single-process path: resolve the requested device string.
                device_list = set(
                    ["cuda", "cpu"]
                    + [f"cuda:{i}" for i in range(gpus)]
                    + ["mps", "mps:0"]
                    + [f"npu:{i}" for i in range(gpus)]
                )
                if device and device in device_list:
                    self._device = torch.device(device)
                    eval_logger.info(f"Using device '{device}'")
                    if device in ("mps", "mps:0") and version.parse(
                        torch.__version__
                    ) < version.parse("2.1"):
                        raise RuntimeError(
                            f"mps requires torch >= 2.1. You have {torch.__version__}"
                        )
                else:
                    eval_logger.info("Device not specified")
                    eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
                    self._device = (
                        torch.device("cuda")
                        if torch.cuda.is_available()
                        else torch.device("cpu")
                    )
            else:
                # Multi-process or model-parallel: Accelerate decides placement.
                if device != "cuda":
                    eval_logger.info(
                        f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
                    )
                self._device = (
                    self.accelerator.device
                    if hasattr(self, "accelerator")
                    else torch.device(device)
                )
            revision = str(revision)
            revision = revision + ("/" + subfolder if subfolder is not None else "")
            self._get_config(
                pretrained,
                revision=revision,
                trust_remote_code=trust_remote_code,
                gguf_file=gguf_file,
            )

        self._get_backend(
            config=self.config, backend=backend, trust_remote_code=trust_remote_code
        )
        self._create_tokenizer(
            pretrained,
            tokenizer,
            revision=revision,
            trust_remote_code=trust_remote_code,
            use_fast_tokenizer=use_fast_tokenizer,
            gguf_file=gguf_file,
            add_bos_token=add_bos_token,
        )

        if isinstance(pretrained, str):
            self._create_model(
                pretrained=pretrained,
                revision=revision,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
                parallelize=parallelize,
                gpus=gpus,
                max_memory_per_gpu=max_memory_per_gpu,
                max_cpu_memory=max_cpu_memory,
                offload_folder=offload_folder,
                peft=peft,
                delta=delta,
                autogptq=autogptq,
                gptqmodel=gptqmodel,
                gguf_file=gguf_file,
                **kwargs,
            )

        if isinstance(self.model, torch.nn.Module):
            self.model.eval()
            self.model.tie_weights()

        self.truncation = truncation
        self.logits_cache = logits_cache
        # Capture vocab size before configure_pad_token may replace the tokenizer.
        self.vocab_size = self.tokenizer.vocab_size
        self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
        self.add_bos_token = add_bos_token

        if "gemma" in getattr(self.config, "model_type", ""):
            # Gemma checkpoints require a BOS token regardless of the flag.
            self.add_bos_token = True
            eval_logger.info(
                f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used."
            )

        self._max_length = max_length
        self.pretrained = pretrained
        self.delta = delta
        self.peft = peft
        self.revision = revision
        self.batch_schedule = 1
        self.batch_sizes = {}
        self.max_batch_size = max_batch_size

        # "auto" or "auto:N" enables dynamic batch-size probing.
        if str(batch_size).startswith("auto"):
            batch_size = batch_size.split(":")
            self.batch_size_per_gpu = batch_size[0]
            self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
        else:
            self.batch_size_per_gpu = int(batch_size)

        if isinstance(pretrained, str):
            if gpus >= 1 or str(self.device) == "mps":
                if not (parallelize or autogptq or hasattr(self, "accelerator")):
                    # Manual placement only when nothing else manages devices.
                    try:
                        self.model.to(self.device)
                    except ValueError:
                        eval_logger.debug(
                            "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided."
                        )
            if gpus > 1:
                if hasattr(self, "accelerator") and self.accelerator.num_processes > 1:
                    if parallelize:
                        eval_logger.warning(
                            "You are both using a HF Accelerate `device_map` and launching via `accelerate launch`."
                        )
                    elif gpus > self.accelerator.num_processes:
                        eval_logger.warning(
                            "WARNING: The number of total system GPUs does not match the number of spawned processes."
                        )
                    self._device = torch.device(f"{self.accelerator.device}")
                    self._rank = self.accelerator.local_process_index
                    self._world_size = self.accelerator.num_processes
                else:
                    self._rank = 0
                    self._world_size = 1
            else:
                self._rank = 0
                self._world_size = 1
        else:
            eval_logger.warning(
                "Passed an already-initialized model through `pretrained`, assuming single-process call."
            )
            self._rank = 0
            self._world_size = 1

        self.custom_prefix_token_id = prefix_token_id
        if prefix_token_id is not None:
            eval_logger.info(
                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
            )
        self.is_first_inference = True

        # HTSSampler is optional (import guarded at module top); generate_until
        # relies on it, so generation fails without the hts_sampler module.
        if HTSSampler is not None:
            self.hts_sampler = HTSSampler(self.model, self.tokenizer, device=self.device)
            eval_logger.info("HTSSampler initialized successfully.")
371
+
372
+ # Copy all the property and helper methods from LLaDA2
373
+ @property
374
+ def rank(self):
375
+ if hasattr(self, "_rank"):
376
+ return self._rank
377
+ if hasattr(self, "accelerator"):
378
+ return self.accelerator.local_process_index
379
+ return int(os.environ.get("LOCAL_RANK", 0))
380
+
381
+ @property
382
+ def world_size(self):
383
+ if hasattr(self, "_world_size"):
384
+ return self._world_size
385
+ if hasattr(self, "accelerator"):
386
+ return self.accelerator.num_processes
387
+ return int(os.environ.get("WORLD_SIZE", 1))
388
+
389
    def _get_accelerate_args(
        self,
        parallelize: Optional[bool] = None,
        device_map: Optional[str] = "auto",
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[str] = "./offload",
        gpus: Optional[int] = None,
    ) -> dict:
        """Get accelerate arguments - same as LLaDA2.

        Builds the kwargs dict (`device_map`, `max_memory`, `offload_folder`)
        passed to `from_pretrained` for sharded loading. With parallelize off,
        pins the whole model to `self.device`.

        NOTE(review): the `device_map` parameter is never read — "auto" is
        hard-coded below. Also, when an accelerator is present, the
        per-process GPU filter overwrites `max_memory_per_gpu_map` and
        silently discards a user-provided `max_memory_per_gpu` — confirm
        that is intended.
        """
        num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
        # With several GPUs and no explicit choice, default to sharding.
        if parallelize is None and gpus is not None and gpus > 1:
            parallelize = True
        args = {}
        if parallelize:
            max_memory_all_gpus = get_max_memory()
            if "cpu" in max_memory_all_gpus:
                del max_memory_all_gpus["cpu"]
            max_memory_per_gpu_map = {
                device_idx: max_memory_per_gpu for device_idx in range(len(max_memory_all_gpus))
            } if max_memory_per_gpu is not None else {k: v for k, v in max_memory_all_gpus.items()}
            if hasattr(self, "accelerator"):
                # Restrict each process to its own subset of local GPUs.
                max_memory_per_gpu_map = {
                    k: v for k, v in max_memory_all_gpus.items()
                    if k % num_local_processes == self.accelerator.process_index % num_local_processes
                }
            args["max_memory"] = max_memory_per_gpu_map
            args["device_map"] = "auto"
            args["offload_folder"] = offload_folder
            if max_cpu_memory is not None:
                args["max_memory"]["cpu"] = max_cpu_memory
        else:
            # Single-device placement: everything on self.device.
            args["device_map"] = {"": str(self.device)}
        return args
423
+
424
    @property
    def config(self):
        """The HF model config cached at construction time."""
        return self._config
427
+
428
+ @property
429
+ def model(self):
430
+ if hasattr(self, "accelerator"):
431
+ return self.accelerator.unwrap_model(self._model)
432
+ else:
433
+ return self._model
434
+
435
    @property
    def eot_token_id(self):
        """End-of-text token id (the tokenizer's EOS)."""
        return self.tokenizer.eos_token_id
438
+
439
+ @property
440
+ def prefix_token_id(self):
441
+ if self.custom_prefix_token_id is not None:
442
+ return self.custom_prefix_token_id
443
+ if self.tokenizer.bos_token_id is not None:
444
+ return self.tokenizer.bos_token_id
445
+ return self.tokenizer.eos_token_id
446
+
447
+ @property
448
+ def max_length(self):
449
+ if self._max_length:
450
+ return self._max_length
451
+ seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
452
+ for attr in seqlen_config_attrs:
453
+ if hasattr(self.model.config, attr):
454
+ return getattr(self.model.config, attr)
455
+ if hasattr(self.tokenizer, "model_max_length"):
456
+ if self.tokenizer.model_max_length > 1e10:
457
+ return self._DEFAULT_MAX_LENGTH
458
+ return self.tokenizer.model_max_length
459
+ return self._DEFAULT_MAX_LENGTH
460
+
461
    @property
    def max_gen_toks(self) -> int:
        """Default maximum number of tokens to generate per request."""
        return 256
464
+
465
    @property
    def batch_size(self):
        """Per-GPU batch size (may be the string prefix from "auto")."""
        return self.batch_size_per_gpu
468
+
469
    @property
    def device(self):
        """Torch device inputs are placed on."""
        return self._device
472
+
473
    @property
    def tokenizer_name(self) -> str:
        """Tokenizer identifier with '/' flattened for use in cache paths."""
        return self.tokenizer.name_or_path.replace("/", "__")
476
+
477
+ def _get_backend(self, config, backend, trust_remote_code):
478
+ """Get backend type - same as LLaDA2"""
479
+ assert backend in ["default", "causal", "seq2seq"]
480
+ if backend != "default":
481
+ self.backend = backend
482
+ eval_logger.info(f"Overrode HF model backend type, and using type '{self.backend}'")
483
+ else:
484
+ if getattr(config, "model_type") in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
485
+ self.backend = "seq2seq"
486
+ elif getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
487
+ self.backend = "causal"
488
+ else:
489
+ eval_logger.warning("HF model type is neither CausalLM nor Seq2SeqLM. Assuming CausalLM.")
490
+ self.backend = "causal"
491
+
492
    def _get_config(self, pretrained, revision, trust_remote_code, gguf_file):
        """Get model config - same as LLaDA2.

        Loads and caches the HF config for `pretrained` into `self._config`.

        NOTE(review): `gguf_file` is accepted but not forwarded to
        `AutoConfig.from_pretrained` — confirm whether GGUF checkpoints are
        expected to work through this wrapper.
        """
        self._config = transformers.AutoConfig.from_pretrained(
            pretrained, revision=revision, trust_remote_code=trust_remote_code
        )
497
+
498
    def _create_model(self, pretrained, revision, dtype, trust_remote_code, parallelize,
                      gpus, max_memory_per_gpu, max_cpu_memory, offload_folder,
                      peft, delta, autogptq, gptqmodel, gguf_file, **kwargs):
        """Create model - same as LLaDA2.

        Loads the checkpoint via AutoModelForCausalLM, optionally applies a
        PEFT adapter, moves it to `self.device` and fixes the dtype.

        NOTE(review): accelerate args are only added when `parallelize` is
        False (the non-sharded path); with `parallelize=True` no device_map
        is passed at all — confirm that is intended.
        NOTE(review): the final `.to(torch.bfloat16)` unconditionally
        overrides the user-requested `dtype`.
        `delta`, `gguf_file` and `gptqmodel`/`autogptq` (beyond the guard)
        are unused.
        """
        if autogptq or gptqmodel:
            raise NotImplementedError("Quantization options are not implemented.")
        model_dtype = get_dtype(dtype)
        eval_logger.info(f"Loading model with dtype: {model_dtype}")
        model_kwargs = kwargs if kwargs else {}
        if not parallelize:
            model_kwargs.update(
                self._get_accelerate_args(
                    parallelize=parallelize,
                    gpus=gpus,
                    max_memory_per_gpu=max_memory_per_gpu,
                    max_cpu_memory=max_cpu_memory,
                    offload_folder=offload_folder
                )
            )
        self._model = transformers.AutoModelForCausalLM.from_pretrained(
            pretrained, revision=revision, torch_dtype=model_dtype,
            trust_remote_code=trust_remote_code, **model_kwargs
        )
        if peft:
            # Lazy import: peft is only required when an adapter is used.
            from peft import PeftModel
            eval_logger.info(f"Loading PEFT model from {peft}")
            self._model = PeftModel.from_pretrained(self._model, peft, torch_dtype=model_dtype)
        if not parallelize:
            self._model = self._model.to(self.device)
        self._model = self._model.to(torch.bfloat16)
        self._model.eval()
529
+
530
+ def _create_tokenizer(self, pretrained, tokenizer, revision, trust_remote_code,
531
+ use_fast_tokenizer, gguf_file, add_bos_token):
532
+ """Create tokenizer - same as LLaDA2"""
533
+ kwargs = {
534
+ "revision": revision,
535
+ "trust_remote_code": trust_remote_code,
536
+ "use_fast": use_fast_tokenizer
537
+ }
538
+ if add_bos_token:
539
+ kwargs["add_bos_token"] = True
540
+ if tokenizer:
541
+ if isinstance(tokenizer, str):
542
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer, **kwargs)
543
+ else:
544
+ self.tokenizer = tokenizer
545
+ else:
546
+ model_name = pretrained if isinstance(pretrained, str) else self.model.name_or_path
547
+ self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, **kwargs)
548
+
549
+ def tok_encode(self, string, left_truncate_len=None, add_special_tokens=None):
550
+ """Tokenize string - same as LLaDA2"""
551
+ special_tokens_kwargs = {}
552
+ if add_special_tokens is None:
553
+ if self.backend == "causal":
554
+ special_tokens_kwargs["add_special_tokens"] = self.add_bos_token
555
+ else:
556
+ special_tokens_kwargs["add_special_tokens"] = add_special_tokens
557
+ encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
558
+ if left_truncate_len:
559
+ encoding = encoding[-left_truncate_len:]
560
+ return encoding
561
+
562
+ def tok_batch_encode(self, strings, padding_side="left", left_truncate_len=None, truncation=False):
563
+ """Batch tokenize - same as LLaDA2"""
564
+ old_padding_side = self.tokenizer.padding_side
565
+ self.tokenizer.padding_side = padding_side
566
+ add_special_tokens = {"add_special_tokens": self.add_bos_token} if self.backend == "causal" else {}
567
+ encoding = self.tokenizer(
568
+ strings, truncation=truncation, padding="longest",
569
+ return_tensors="pt", **add_special_tokens
570
+ )
571
+ if left_truncate_len and encoding["input_ids"].size(1) > left_truncate_len:
572
+ eval_logger.warning(f"Left-truncating from {encoding['input_ids'].size(1)} to {left_truncate_len} tokens.")
573
+ encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
574
+ encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
575
+ self.tokenizer.padding_side = old_padding_side
576
+ return encoding["input_ids"].to(self.device), encoding["attention_mask"].to(self.device)
577
+
578
    def tok_decode(self, tokens, skip_special_tokens=False):
        """Decode token ids back to text."""
        return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
581
+
582
+ def _model_call(self, inps, attn_mask=None, labels=None):
583
+ """Model forward call - same as LLaDA2"""
584
+ with torch.no_grad():
585
+ if self.backend == "seq2seq":
586
+ return self.model(input_ids=inps, attention_mask=attn_mask, labels=labels).logits
587
+ else:
588
+ return self.model(inps, attention_mask=attn_mask).logits
589
+
590
    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
        """Not supported by this diffusion-LM wrapper."""
        raise NotImplementedError
592
+
593
    def loglikelihood_rolling(
        self, requests: List[Instance], disable_tqdm: bool = False
    ) -> List[float]:
        """Not supported by this diffusion-LM wrapper."""
        raise NotImplementedError
597
+
598
    def loglikelihood(self, requests):
        """Not supported by this diffusion-LM wrapper."""
        raise NotImplementedError
600
+
601
+ def generate_until(self, requests: List[Instance]) -> List[str]:
602
+ """Generate until - adapted for LLaDA v1 """
603
+ res = []
604
+ gen_kwargs = requests[0].args[1]
605
+ use_hts = gen_kwargs.get("use_hts", False)
606
+
607
+ realtime_output = gen_kwargs.get("realtime_output", "realtime_hts_results.jsonl")
608
+ baseline_realtime_output = gen_kwargs.get("realtime_output", "realtime_baseline_results.jsonl")
609
+
610
+ if not use_hts and "realtime_output" not in gen_kwargs:
611
+ baseline_realtime_output = "realtime_baseline_results.jsonl"
612
+
613
+ if not use_hts:
614
+ bar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Running Baseline (LLaDA v1)")
615
+
616
+ for req in requests:
617
+ prompt_text = req.args[0]
618
+ local_gen_kwargs = req.args[1] if len(req.args) > 1 else {}
619
+
620
+ context_enc, _ = self.tok_batch_encode([prompt_text])
621
+
622
+ final_codes, stats = self.hts_sampler.generate_hts(
623
+ prompt_text=prompt_text,
624
+ input_ids=context_enc,
625
+ initial_N=1,
626
+ final_K=1,
627
+ hts_survivor_k=1,
628
+ hts_mode=False,
629
+ hts_start_pct=0.0,
630
+ hts_end_pct=0.0,
631
+ decay_factor=1.5,
632
+ pruning_interval=0,
633
+ reward_mode="confidence",
634
+ task_type=local_gen_kwargs.get("task_type", "code"),
635
+ steps=int(local_gen_kwargs.get("steps", 32)),
636
+ gen_length=int(local_gen_kwargs.get("gen_length", 512)),
637
+ block_length=int(local_gen_kwargs.get("block_length", 32)),
638
+ temperature=float(local_gen_kwargs.get("temperature", 0.0)),
639
+ top_p=float(local_gen_kwargs.get("top_p", 0.95)),
640
+ top_k=local_gen_kwargs.get("top_k", None),
641
+ threshold=float(local_gen_kwargs.get("threshold", 0.85)),
642
+ mask_id=self.mask_id,
643
+ eos_id=self.eot_token_id,
644
+ until=local_gen_kwargs.get("until", []),
645
+ )
646
+
647
+ processed_codes = []
648
+ for code in final_codes:
649
+ code = code.strip()
650
+ if not self.escape_until:
651
+ until_terms = local_gen_kwargs.get("until", [])
652
+ for term in until_terms:
653
+ if len(term) > 0 and term in code:
654
+ code = code.split(term)[0]
655
+ processed_codes.append(code)
656
+
657
+ final_choice = processed_codes[0] if processed_codes else ""
658
+ res.append(final_choice)
659
+
660
+ target_val = getattr(req, "target", None)
661
+ if target_val is None or target_val == "N/A":
662
+ if "test" in req.doc and "entry_point" in req.doc:
663
+ target_val = req.doc["test"] + "\ncheck(" + req.doc["entry_point"] + ")"
664
+ else:
665
+ target_val = req.doc.get("answer", req.doc.get("solution", "N/A"))
666
+
667
+ output_dir = os.path.dirname(baseline_realtime_output)
668
+ if output_dir:
669
+ os.makedirs(output_dir, exist_ok=True)
670
+ with open(baseline_realtime_output, "a", encoding="utf-8") as f:
671
+ all_resps = [[code] for code in processed_codes]
672
+ output_data = {
673
+ "doc": req.doc,
674
+ "target": target_val,
675
+ "resps": all_resps,
676
+ "prompt": prompt_text,
677
+ "entropy_history": stats.get("entropy_history", []),
678
+ "pruning_history": stats.get("pruning_history", []),
679
+ "final_scores": stats.get("final_scores", []),
680
+ "all_trajectories": stats.get("all_trajectories", []),
681
+ "nfe": stats.get("nfe", 0),
682
+ "first_block_nfe": stats.get("first_block_nfe", 0),
683
+ "svf_calls": stats.get("svf_calls", 0),
684
+ "total_steps": stats.get("total_steps", 0),
685
+ "num_gen_blocks": stats.get("num_gen_blocks", []),
686
+ "steps_per_block": stats.get("steps_per_block", [])
687
+ }
688
+ f.write(json.dumps(output_data, ensure_ascii=False) + "\n")
689
+ f.flush()
690
+
691
+ bar.update(1)
692
+ bar.close()
693
+
694
+ else:
695
+ bar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Running HTS+SVF (LLaDA v1)")
696
+ for req in requests:
697
+ prompt_text = req.args[0]
698
+ local_gen_kwargs = req.args[1] if len(req.args) > 1 else {}
699
+ context_enc, _ = self.tok_batch_encode([prompt_text])
700
+
701
+ p_interval = int(local_gen_kwargs.get("pruning_interval", 0))
702
+
703
+ final_codes, stats = self.hts_sampler.generate_hts(
704
+ prompt_text=prompt_text,
705
+ input_ids=context_enc,
706
+ initial_N=int(local_gen_kwargs.get("hts_N", 4)),
707
+ final_K=int(local_gen_kwargs.get("final_K", 1)),
708
+ hts_survivor_k=int(local_gen_kwargs.get("hts_survivor_k", 4)),
709
+ hts_mode=local_gen_kwargs.get("hts_mode", True),
710
+ hts_start_pct=float(local_gen_kwargs.get("hts_start_pct", 0.1)),
711
+ hts_end_pct=float(local_gen_kwargs.get("hts_end_pct", 0.6)),
712
+ decay_factor=float(local_gen_kwargs.get("decay_factor", 1.5)),
713
+ pruning_interval=p_interval,
714
+ reward_mode=local_gen_kwargs.get("reward_mode", "svf"),
715
+ task_type=local_gen_kwargs.get("task_type", "code"),
716
+ steps=int(local_gen_kwargs.get("steps", 32)),
717
+ gen_length=int(local_gen_kwargs.get("gen_length", 512)),
718
+ block_length=int(local_gen_kwargs.get("block_length", 32)),
719
+ temperature=float(local_gen_kwargs.get("temperature", 0.7)),
720
+ top_p=float(local_gen_kwargs.get("top_p", 0.95)),
721
+ top_k=local_gen_kwargs.get("top_k", None),
722
+ threshold=float(local_gen_kwargs.get("threshold", 0.85)),
723
+ mask_id=self.mask_id,
724
+ eos_id=self.eot_token_id,
725
+ until=local_gen_kwargs.get("until", []),
726
+ )
727
+
728
+ processed_codes = []
729
+ for code in final_codes:
730
+ code = code.strip()
731
+ if not self.escape_until:
732
+ until_terms = local_gen_kwargs.get("until", [])
733
+ for term in until_terms:
734
+ if len(term) > 0 and term in code:
735
+ code = code.split(term)[0]
736
+ processed_codes.append(code)
737
+
738
+ final_choice = processed_codes[0]
739
+ res.append(final_choice)
740
+
741
+ target_val = getattr(req, "target", None)
742
+ if target_val is None or target_val == "N/A":
743
+ if "test" in req.doc and "entry_point" in req.doc:
744
+ target_val = req.doc["test"] + "\ncheck(" + req.doc["entry_point"] + ")"
745
+ else:
746
+ target_val = req.doc.get("answer", req.doc.get("solution", "N/A"))
747
+
748
+ output_dir = os.path.dirname(realtime_output)
749
+ if output_dir:
750
+ os.makedirs(output_dir, exist_ok=True)
751
+ with open(realtime_output, "a", encoding="utf-8") as f:
752
+ all_resps = [[code] for code in processed_codes]
753
+ output_data = {
754
+ "doc": req.doc,
755
+ "target": target_val,
756
+ "resps": all_resps,
757
+ "prompt": prompt_text,
758
+ "entropy_history": stats.get("entropy_history", []),
759
+ "pruning_history": stats.get("pruning_history", []),
760
+ "final_scores": stats.get("final_scores", []),
761
+ "all_trajectories": stats.get("all_trajectories", []),
762
+ "nfe": stats.get("nfe", 0),
763
+ "first_block_nfe": stats.get("first_block_nfe", 0),
764
+ "svf_calls": stats.get("svf_calls", 0),
765
+ "total_steps": stats.get("total_steps", 0),
766
+ "num_gen_blocks": stats.get("num_gen_blocks", []),
767
+ "steps_per_block": stats.get("steps_per_block", [])
768
+ }
769
+ f.write(json.dumps(output_data, ensure_ascii=False) + "\n")
770
+ f.flush()
771
+
772
+ bar.update(1)
773
+ bar.close()
774
+
775
+ return res
776
+
777
+ def apply_chat_template(
778
+ self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
779
+ ) -> str:
780
+ """Apply chat template - same as LLaDA2"""
781
+ chat_templated = self.tokenizer.apply_chat_template(
782
+ chat_history, tokenize=False, add_generation_prompt=add_generation_prompt
783
+ )
784
+ if self.assistant_prefix:
785
+ chat_templated += self.assistant_prefix
786
+ return chat_templated
Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Import model wrapper modules so their @register_model decorators run.
from . import (
    LLaDA,
    huggingface,
)
# from .configuration_llada import LLaDAConfig
# from .modeling_llada import LLaDAModelLM


try:
    # enable hf hub transfer if available
    import hf_transfer  # type: ignore # noqa
    import huggingface_hub.constants  # type: ignore

    huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
except ImportError:
    # hf_transfer is optional; fall back to regular downloads.
    pass


# __all__ = ['LLaDAConfig', 'LLaDAModelLM']
Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/dummy.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ from tqdm import tqdm
4
+
5
+ from dllm_eval.api.model import LM
6
+ from dllm_eval.api.registry import register_model
7
+
8
+
9
+ @register_model("dummy")
10
+ class DummyLM(LM):
11
+ def __init__(self) -> None:
12
+ super().__init__()
13
+
14
+ @classmethod
15
+ def create_from_arg_string(cls, arg_string, additional_config=None):
16
+ return cls()
17
+
18
+ def loglikelihood(self, requests, disable_tqdm: bool = False):
19
+ res = []
20
+
21
+ for _ in tqdm(requests, disable=disable_tqdm):
22
+ res.append((-random.random(), False))
23
+
24
+ return res
25
+
26
+ def generate_until(self, requests, disable_tqdm: bool = False):
27
+ res = []
28
+
29
+ for request in tqdm(requests, disable=disable_tqdm):
30
+ res.append("lol")
31
+ assert request.arguments[0].strip() != ""
32
+
33
+ return res
34
+
35
+ def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
36
+ res = []
37
+
38
+ for _ in tqdm(requests, disable=disable_tqdm):
39
+ res.append(-random.random())
40
+
41
+ return res
Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/hts_sampler.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import numpy as np
4
+ from .verifier import CodeVerifier
5
+ import logging
6
+ import re
7
+ import math
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class HTSSampler:
    """Hierarchical tree-search (HTS) sampler for masked-diffusion decoding.

    Maintains a population of candidate sequences, iteratively unmasks
    tokens block by block, and periodically prunes and re-branches
    candidates using a ``CodeVerifier`` reward plus cheap structural
    heuristics on the decoded text.
    """

    def __init__(self, model, tokenizer, device="cuda"):
        # model: masked-diffusion LM producing per-position vocabulary logits.
        # tokenizer: paired tokenizer used for decoding and verification.
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.verifier = CodeVerifier(model, tokenizer, device)

    def _get_num_transfer_tokens(self, block_length, steps):
        """Split ``block_length`` token slots as evenly as possible over ``steps``.

        Returns an int64 tensor of length ``steps`` summing to ``block_length``;
        the first ``block_length % steps`` entries receive one extra slot.
        """
        if steps == 0: return torch.tensor([], dtype=torch.int64)
        base = block_length // steps
        remainder = block_length % steps
        num_transfer_tokens = torch.full((steps,), base, dtype=torch.int64)
        num_transfer_tokens[:remainder] += 1
        return num_transfer_tokens

    def _sample_with_temperature(self, logits, temperature, top_k, top_p):
        """Sample token ids from ``logits`` via Gumbel-max with optional top-k.

        Returns ``(x0, x0_p)``: sampled ids and the *pre-noise* max softmax
        probability per position, later used as a confidence score.
        NOTE(review): ``top_p`` is accepted but never applied in this body.
        """
        logits = logits.to(torch.float32)

        # Confidence is taken from the untempered distribution.
        orig_probs = torch.softmax(logits, dim=-1)
        x0_p, _ = torch.max(orig_probs, dim=-1)

        if temperature > 0.0:
            # Gumbel-max trick: argmax over noised logits == categorical sample.
            noise = torch.rand_like(logits, dtype=torch.float32)
            gumbel_noise = -torch.log(-torch.log(noise + 1e-10) + 1e-10)
            logits = logits / temperature + gumbel_noise

        if top_k is not None and top_k > 0:
            # Mask out everything below the k-th largest logit.
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = -float('Inf')

        x0 = torch.argmax(logits, dim=-1)

        return x0, x0_p

    def _safe_scalar(self, val):
        """Coerce a tensor or number to a Python float (mean of multi-element tensors)."""
        if isinstance(val, torch.Tensor):
            if val.numel() > 1: return val.mean().item()
            return val.item()
        return float(val)

    def _analyze_structure(self, text, task_type="code"):
        """Cheap structural prior added on top of the verifier reward.

        Rewards code-like / math-like surface features and penalizes
        degenerate outputs (too short, or excessively repetitive steps).
        """
        score = 0.0
        stripped = text.strip()
        if task_type == "code":
            if len(stripped) < 5: return -0.1
            keywords = ["return", "print", "yield", "lambda", "class ", "def "]
            if any(k in stripped for k in keywords): score += 0.05
            if ":" in stripped: score += 0.02
            # NOTE(review): a single-space check matches almost any text;
            # possibly intended to test for indentation — confirm original.
            if " " in text: score += 0.03
        elif task_type == "math":
            if "\\boxed{" in stripped: score += 0.1
            if "The answer is" in stripped: score += 0.05
            if len(stripped) < 10: return -0.1
            if "Step" in stripped and stripped.count("Step") > 15: score -= 0.2
        return score

    def _chunked_forward(self, x, chunk_size=96, slice_indices=None):
        """Run the model over batch ``x`` in sub-batches of ``chunk_size``.

        Optionally slices returned logits to ``[s_start, s_end)`` along the
        sequence dimension before concatenating, bounding peak memory.
        """
        total_batch = x.shape[0]
        logits_list = []
        for i in range(0, total_batch, chunk_size):
            end_idx = min(i + chunk_size, total_batch)
            sub_x = x[i:end_idx]
            # Full attention over the whole (prompt + generation) window.
            sub_mask = torch.ones_like(sub_x, device=self.device)
            with torch.no_grad():
                outputs = self.model(input_ids=sub_x, attention_mask=sub_mask)
            sub_logits = outputs.logits
            if slice_indices is not None:
                s_start, s_end = slice_indices
                sub_logits = sub_logits[:, s_start:s_end, :]
            logits_list.append(sub_logits.detach().clone())
        return torch.cat(logits_list, dim=0)

    def _branch_and_resample(self, x, conf_scores, survivor_indices, target_width, mask_id,
                             prompt_length, resample_window=5, task_type="code"):
        """Expand the ``survivor_indices`` rows of ``x`` back up to ``target_width``.

        Each survivor is kept verbatim once; extra copies are perturbed by
        re-masking a random subset of their lowest-confidence generated
        tokens so that branches explore different completions.
        Returns ``(new_x, new_conf)`` of shape ``(target_width, seq_len)``.
        """
        num_survivors = len(survivor_indices)
        if num_survivors == 0: return x[:target_width].clone(), conf_scores[:target_width].clone()

        # Task-specific perturbation width overrides the default argument.
        if task_type == "math": resample_window = 6
        elif task_type == "reasoning": resample_window = 6
        elif task_type == "code": resample_window = 6

        base_repeat = target_width // num_survivors
        remainder = target_width % num_survivors
        new_x_list = []
        new_conf_list = []

        for i in range(num_survivors):
            count = base_repeat + (1 if i < remainder else 0)
            if count == 0: continue

            survivor_x = x[survivor_indices[i]]
            survivor_conf = conf_scores[survivor_indices[i]]

            # Keep one unmodified copy of every survivor.
            new_x_list.append(survivor_x.unsqueeze(0))
            new_conf_list.append(survivor_conf.unsqueeze(0))

            if count > 1:
                gen_part = survivor_x[prompt_length:]
                gen_conf = survivor_conf[prompt_length:]
                non_mask_indices = (gen_part != mask_id).nonzero(as_tuple=True)[0]

                for _ in range(count - 1):
                    perturbed_x = survivor_x.clone()
                    perturbed_conf = survivor_conf.clone()

                    if len(non_mask_indices) > 0:
                        # Candidate pool: the 2*window lowest-confidence tokens.
                        pool_size = min(resample_window * 2, len(non_mask_indices))
                        current_token_confs = gen_conf[non_mask_indices]

                        _, candidate_indices = torch.topk(current_token_confs, k=pool_size, largest=False)

                        # Randomly re-mask `resample_window` tokens from the pool.
                        num_to_perturb = min(resample_window, pool_size)
                        rand_indices = torch.randperm(pool_size, device=self.device)[:num_to_perturb]
                        selected_sub_indices = candidate_indices[rand_indices]

                        target_indices_in_x = prompt_length + non_mask_indices[selected_sub_indices]
                        perturbed_x[target_indices_in_x] = mask_id
                        perturbed_conf[target_indices_in_x] = 0.0

                    new_x_list.append(perturbed_x.unsqueeze(0))
                    new_conf_list.append(perturbed_conf.unsqueeze(0))

        return torch.cat(new_x_list, dim=0), torch.cat(new_conf_list, dim=0)

    @torch.no_grad()
    def generate_hts(self, prompt_text, input_ids, problem_data=None,
                     initial_N=1, final_K=1, survivor_K=None,
                     prune_step_pct=0.0, reward_mode="confidence",
                     temperature=0.7, block_length=32, steps=64, gen_length=1024,
                     top_p=0.95, top_k=None, minimal_topk=1, threshold=0.9,
                     eos_id=156892, mask_id=156895,
                     hts_mode=False, hts_start_pct=0.1, hts_end_pct=0.6, decay_factor=1.5,
                     hts_survivor_k=4, task_type="code", until=None, pruning_interval=0):
        """Generate with block-wise diffusion decoding plus population pruning.

        Starts ``initial_N`` candidates, unmasks ``block_length`` tokens per
        block over ``steps`` diffusion steps, and at scheduled steps scores
        candidates (verifier reward + structure heuristic), keeps the best
        parents, and re-branches back up to the target width.

        Returns ``(responses, stats)``: decoded responses sorted by final
        score (best first) and a dict of bookkeeping statistics (NFE counts,
        pruning history, per-block step counts, ranked trajectories).
        NOTE(review): ``minimal_topk``, ``top_p`` and ``prefill_blocks`` are
        currently unused in this body.
        """
        input_ids = input_ids.to(self.device)
        # Broadcast a single prompt to the initial population width.
        if input_ids.shape[0] == 1: input_ids = input_ids.repeat(initial_N, 1)

        schedule_map = {}
        ts_start, tr_end = 0, 0
        if not hts_mode:
            # Fixed pruning schedule: at step int(steps*pct) shrink to `width`
            # candidates grown from `parents` survivors.
            final_K_list = [final_K] if not isinstance(final_K, list) else final_K
            prune_pct_list = [prune_step_pct] if not isinstance(prune_step_pct, list) else prune_step_pct
            survivor_K_list = final_K_list if survivor_K is None else ([survivor_K] if not isinstance(survivor_K, list) else survivor_K)
            if len(survivor_K_list) < len(final_K_list): survivor_K_list.extend(final_K_list[len(survivor_K_list):])
            for pct, width, parents in zip(prune_pct_list, final_K_list, survivor_K_list):
                if pct > 0:
                    s = int(steps * pct)
                    schedule_map[s] = (width, parents)
        else:
            # NOTE(review): both branches of this conditional produce
            # [final_K]; if final_K is a list it gets nested — confirm intent.
            final_K_list = [final_K] if not isinstance(final_K, int) else [final_K]
            # Exponential-decay pruning is active only in [ts_start, tr_end).
            ts_start, tr_end = int(steps * hts_start_pct), int(steps * hts_end_pct)


        prompt_length = input_ids.shape[1]
        # Round the generation length up to a whole number of blocks.
        num_blocks = (gen_length + block_length - 1) // block_length
        total_length = prompt_length + num_blocks * block_length

        # Canvas: prompt tokens followed by all-mask generation slots.
        x = torch.full((initial_N, total_length), mask_id, dtype=torch.long, device=self.device)
        x[:, :prompt_length] = input_ids.clone()

        # Per-token confidence; prompt positions are fully trusted.
        conf_scores = torch.zeros((initial_N, total_length), dtype=torch.float32, device=self.device)
        conf_scores[:, :prompt_length] = 1.0

        prefill_blocks = 0
        num_gen_blocks = num_blocks
        current_bsz = initial_N

        next_allowed_pruning_step = ts_start if hts_mode else 0

        stats = {
            "initial_n": initial_N, "final_k": final_K_list[-1],
            "pruning_history": [], "entropy_history": [], "nfe": 0.0,
            "svf_calls": 0, "final_scores": [], "total_steps": steps,
            "first_block_nfe": 0.0, "num_gen_blocks": [], "steps_per_block": []
        }

        for num_block in range(num_gen_blocks):
            stats["num_gen_blocks"].append(num_block)

            # The slice of the canvas being denoised during this block.
            window_start = prompt_length + num_block * block_length
            window_end = window_start + block_length

            schedule = self._get_num_transfer_tokens(block_length, steps)

            steps_this_block = 0
            for step in range(steps):
                steps_this_block += 1
                cur_full_x = x[:current_bsz, :]

                perform_pruning = False
                num_parents_to_select = 0

                if hts_mode and step >= next_allowed_pruning_step and step < tr_end:
                    # Width decays geometrically from initial_N, floored at final_K.
                    target_width = max(final_K_list[-1], math.ceil(initial_N * (decay_factor ** -(step - ts_start))))
                    if current_bsz > target_width:
                        perform_pruning = True
                        num_parents_to_select = hts_survivor_k
                elif not hts_mode and step in schedule_map:
                    target_width, num_parents_to_select = schedule_map[step]
                    if current_bsz > target_width: perform_pruning = True

                if perform_pruning:
                    stats["nfe"] += current_bsz
                    if num_block == 0: stats["first_block_nfe"] += current_bsz
                    stats["svf_calls"] += current_bsz

                    # Greedy-decode the whole generation region to score candidates.
                    gen_logits = self._chunked_forward(cur_full_x, chunk_size=64, slice_indices=(prompt_length, total_length))
                    rough_ids = torch.argmax(gen_logits, dim=-1)
                    rough_codes_snippet = self.tokenizer.batch_decode(rough_ids, skip_special_tokens=True)
                    candidates = []
                    for i in range(current_bsz):
                        full_code = rough_codes_snippet[i]
                        s = self._safe_scalar(self.verifier.get_reward(prompt_text, full_code, mode=reward_mode, problem_data=problem_data, current_logits=gen_logits[i] if reward_mode != "svf" else None, task_type=task_type))
                        s += self._analyze_structure(full_code, task_type=task_type)
                        # Hash of head+tail for cheap near-duplicate detection.
                        clean_content = full_code.strip().replace(" ", "").replace("\n", "")
                        candidates.append({'score': s, 'idx': i, 'key': hash(clean_content[:200] + clean_content[-200:])})

                    stats["pruning_history"].append({"step": step, "scores": [c['score'] for c in candidates]})
                    candidates.sort(key=lambda x: x['score'], reverse=True)

                    # Select best-scoring, mutually distinct parents first.
                    selected_indices, seen_keys = [], set()
                    for cand in candidates:
                        if len(selected_indices) >= num_parents_to_select: break
                        if cand['key'] not in seen_keys:
                            selected_indices.append(cand['idx']); seen_keys.add(cand['key'])

                    # Backfill with duplicates if diversity fell short.
                    if len(selected_indices) < num_parents_to_select:
                        for cand in candidates:
                            if len(selected_indices) >= num_parents_to_select: break
                            if cand['idx'] not in selected_indices: selected_indices.append(cand['idx'])

                    top_indices = torch.tensor(selected_indices, device=self.device)
                    x, conf_scores = self._branch_and_resample(x, conf_scores, top_indices, target_width, mask_id, prompt_length, task_type=task_type)

                    current_bsz = target_width
                    cur_full_x = x[:current_bsz, :]
                    next_allowed_pruning_step = step + 1 + pruning_interval

                stats["nfe"] += current_bsz
                if num_block == 0: stats["first_block_nfe"] += current_bsz

                # Denoise only the current block window.
                active_logits = self._chunked_forward(cur_full_x, chunk_size=32, slice_indices=(window_start, window_end))
                # Forbid sampling EOS inside the active window.
                active_logits[:, :, eos_id] = -1e10

                x0, x0_p = self._sample_with_temperature(active_logits, temperature, top_k, top_p)

                active_mask = x[:current_bsz, window_start:window_end] == mask_id

                num_transfer = schedule[step].item()
                confidence = torch.where(active_mask, x0_p, -torch.inf)
                transfer_idx = torch.zeros_like(x0, dtype=torch.bool)

                for b in range(current_bsz):
                    mask_count = active_mask[b].sum().item()
                    if mask_count > 0:
                        k_transfer = min(num_transfer, mask_count)
                        active_indices = torch.where(active_mask[b])[0]
                        # Commit every token above the confidence threshold at
                        # once; otherwise fall back to the scheduled top-k.
                        high_conf_mask = (confidence[b] > threshold) & active_mask[b]
                        if high_conf_mask.sum().item() >= k_transfer:
                            conf_indices = torch.where(high_conf_mask)[0]
                            transfer_idx[b, conf_indices] = True
                        else:
                            _, topk_indices = torch.topk(confidence[b][active_indices], k=min(k_transfer, len(active_indices)))
                            transfer_idx[b, active_indices[topk_indices]] = True

                if transfer_idx.any():
                    x[:current_bsz, window_start:window_end][transfer_idx] = x0[transfer_idx]
                    conf_scores[:current_bsz, window_start:window_end][transfer_idx] = x0_p[transfer_idx]

            # Early stop for math/reasoning: once an answer marker appears,
            # flood the remaining masked slots with EOS.
            if task_type in ["math", "reasoning"]:
                for b in range(current_bsz):
                    text_snippet = self.tokenizer.decode(x[b, prompt_length:window_end], skip_special_tokens=True)
                    should_stop = False
                    if task_type == "reasoning" and ("###" in text_snippet): should_stop = True
                    if task_type == "math" and ("\\boxed{" in text_snippet and "}" in text_snippet.split("\\boxed{")[-1]): should_stop = True

                    if should_stop:
                        after_mask = (x[b, window_start:total_length] == mask_id)
                        x[b, window_start:total_length][after_mask] = eos_id

            stats["steps_per_block"].append(steps_this_block)
            x = x[:current_bsz]

        stats["nfe"] = int(round(stats["nfe"]))
        stats["first_block_nfe"] = int(round(stats["first_block_nfe"]))

        # Final scoring pass over the decoded candidates.
        final_gen_tokens = x[:current_bsz, prompt_length:]
        final_codes = self.tokenizer.batch_decode(final_gen_tokens, skip_special_tokens=True)
        final_candidates = []

        stats["svf_calls"] += len(final_codes)
        for i in range(len(final_codes)):
            txt = final_codes[i]
            if until:
                # Truncate at the first stop sequence, if any.
                for term in until:
                    if term in txt: txt = txt.split(term)[0]
            s = self._safe_scalar(self.verifier.get_reward(prompt_text, txt, mode=reward_mode, task_type=task_type))
            s += self._analyze_structure(txt, task_type)
            final_candidates.append({'resp': txt, 'score': s})

        final_candidates.sort(key=lambda x: x['score'], reverse=True)
        stats["final_scores"] = [c['score'] for c in final_candidates]
        stats["all_trajectories"] = [{"rank": i+1, "resp": c['resp'], "score": c['score']} for i, c in enumerate(final_candidates)]

        return [c['resp'] for c in final_candidates], stats
Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/huggingface.py ADDED
@@ -0,0 +1,1489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import logging
3
+ import os
4
+ from datetime import timedelta
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
7
+
8
+ import jinja2
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import transformers
12
+ from accelerate import (
13
+ Accelerator,
14
+ InitProcessGroupKwargs,
15
+ find_executable_batch_size,
16
+ )
17
+ from accelerate.utils import get_max_memory
18
+ from huggingface_hub import HfApi
19
+ from packaging import version
20
+ from peft import PeftModel
21
+ from peft import __version__ as PEFT_VERSION
22
+ from tqdm import tqdm
23
+ from transformers.models.auto.modeling_auto import (
24
+ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
25
+ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
26
+ )
27
+
28
+ from dllm_eval import utils
29
+ from dllm_eval.api.instance import Instance
30
+ from dllm_eval.api.model import TemplateLM
31
+ from dllm_eval.api.registry import register_model
32
+ from dllm_eval.models.utils import (
33
+ Collator,
34
+ clear_torch_cache,
35
+ configure_pad_token,
36
+ get_dtype,
37
+ handle_stop_sequences,
38
+ pad_and_concat,
39
+ stop_sequences_criteria,
40
+ )
41
+
42
+
43
+ eval_logger = logging.getLogger(__name__)
44
+
45
+
46
+ @register_model("hf-auto", "hf", "huggingface")
47
+ class HFLM(TemplateLM):
48
+ """
49
+ An abstracted Huggingface model class. Enables usage with both models of
50
+ `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
51
+
52
+ Supports data-parallel multi-GPU with HF Accelerate.
53
+ """
54
+
55
+ AUTO_MODEL_CLASS = None
56
+ _DEFAULT_MAX_LENGTH = 2048
57
+
58
    def __init__(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
        backend: Literal["default", "causal", "seq2seq"] = "default",
        # override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
        revision: Optional[str] = "main",
        subfolder: str = "",
        tokenizer: Optional[
            Union[
                str,
                transformers.PreTrainedTokenizer,
                transformers.PreTrainedTokenizerFast,
            ]
        ] = None,
        truncation: Optional[bool] = False,
        logits_cache: bool = True,
        max_length: Optional[int] = None,
        device: Optional[str] = "cuda",
        dtype: Optional[Union[str, torch.dtype]] = "auto",
        softmax_dtype: Optional[Union[str, torch.dtype]] = None,
        batch_size: Optional[Union[int, str]] = 1,
        max_batch_size: Optional[int] = 64,
        trust_remote_code: Optional[bool] = False,
        use_fast_tokenizer: Optional[bool] = True,
        add_bos_token: Optional[bool] = False,
        prefix_token_id: Optional[int] = None,
        # arguments used for splitting a model across GPUs naively.
        # only used if `parallelize=True`.
        parallelize: Optional[bool] = False,
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[Union[str, os.PathLike]] = "./offload",
        # PEFT, delta weights and quantization options
        peft: Optional[str] = None,
        delta: Optional[str] = None,
        autogptq: Optional[Union[bool, str]] = False,
        gptqmodel: Optional[bool] = False,
        gguf_file: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Load (or adopt) a HuggingFace model + tokenizer for evaluation.

        Args:
            pretrained: model name/path, or an already-initialized
                ``transformers.PreTrainedModel`` (most other loading args are
                then ignored and no distributed setup is performed).
            backend: force "causal" or "seq2seq"; "default" auto-detects.
            tokenizer: name/path or tokenizer object; defaults to `pretrained`.
            truncation / logits_cache / max_length: inference behavior knobs.
            device / dtype / softmax_dtype: placement and precision.
            batch_size: an int, or "auto"/"auto:N" for automatic batch sizing
                (the optional suffix is the re-detection schedule).
            parallelize & memory args: naive multi-GPU model sharding via
                accelerate's `device_map` machinery.
            peft / delta / autogptq / gptqmodel / gguf_file: adapter, delta
                weight and quantization loading options.
            prefix_token_id: custom token id prepended for loglikelihood.
            **kwargs: forwarded to model construction (`_create_model`).
        """
        super().__init__()
        # optionally: take in an already-initialized transformers.PreTrainedModel
        if not isinstance(pretrained, str):
            eval_logger.warning(
                "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way."
            )
            assert not parallelize, (
                "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
            )
            self._model = pretrained
            self._device = self._model.device
            self._config = self._model.config
            gpus = 0

        else:
            assert isinstance(device, str)
            assert isinstance(pretrained, str)
            assert isinstance(batch_size, (int, str))

            gpus = torch.cuda.device_count()
            # Very long timeout so multi-process runs don't die while ranks
            # wait on slow model downloads / loading.
            accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
            accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
            if accelerator.num_processes > 1:
                self.accelerator = accelerator

            if "npu" in accelerator.device.type:
                gpus = torch.npu.device_count()

            # using one process with no model parallelism
            if not (parallelize or accelerator.num_processes > 1):
                # use user-passed device
                device_list = set(
                    ["cuda", "cpu"]
                    + [f"cuda:{i}" for i in range(gpus)]
                    + ["mps", "mps:0"]
                    + [f"npu:{i}" for i in range(gpus)]
                )
                if device and device in device_list:
                    self._device = torch.device(device)
                    eval_logger.info(f"Using device '{device}'")
                    if device in ("mps", "mps:0") and version.parse(
                        torch.__version__
                    ) < version.parse("2.1"):
                        raise RuntimeError(
                            f"mps requires torch >= 2.1. You have {torch.__version__}"
                        )
                else:
                    eval_logger.info("Device not specified")
                    eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
                    self._device = (
                        torch.device("cuda")
                        if torch.cuda.is_available()
                        else torch.device("cpu")
                    )
            else:  # Parallelism managed by accelerate
                if device != "cuda":
                    eval_logger.info(
                        f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
                    )
                # TODO: include in warning that `load_in_8bit` etc. affect this too
                self._device = (
                    self.accelerator.device
                    if hasattr(self, "accelerator")
                    else torch.device(device)
                )

            revision = str(revision)  # cast to string if not already one

            self._get_config(
                pretrained,
                revision=revision,
                trust_remote_code=trust_remote_code,
                gguf_file=gguf_file,
                subfolder=subfolder,
            )

        # determine which of 'causal' and 'seq2seq' backends to use for HF models
        self._get_backend(
            config=self.config, backend=backend, trust_remote_code=trust_remote_code
        )

        # load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
        self._create_tokenizer(
            pretrained,
            tokenizer,
            revision=revision,
            subfolder=subfolder,
            trust_remote_code=trust_remote_code,
            use_fast_tokenizer=use_fast_tokenizer,
            gguf_file=gguf_file,
            add_bos_token=add_bos_token,
        )

        # if we passed `pretrained` as a string, initialize our model now
        if isinstance(pretrained, str):
            self._create_model(
                pretrained=pretrained,
                revision=revision,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
                parallelize=parallelize,
                gpus=gpus,
                max_memory_per_gpu=max_memory_per_gpu,
                max_cpu_memory=max_cpu_memory,
                offload_folder=offload_folder,
                peft=peft,
                delta=delta,
                autogptq=autogptq,
                gptqmodel=gptqmodel,
                gguf_file=gguf_file,
                quantization_config=getattr(self.config, "quantization_config", None),
                subfolder=subfolder,
                **kwargs,
            )

        # access self._model through self.model property outside this method
        if isinstance(self.model, torch.nn.Module):
            self.model.eval()
            self.model.tie_weights()

        self.truncation = truncation
        self.logits_cache = logits_cache
        self.vocab_size = self.tokenizer.vocab_size
        # select (or create) a pad token to use
        self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)

        self.add_bos_token = add_bos_token
        if "gemma" in getattr(self.config, "model_type", ""):
            self.add_bos_token = True
            eval_logger.info(
                f"Model type is '{self.config.model_type}', part of the Gemma family--a BOS token will be used as Gemma underperforms without it."
            )

        self._max_length = max_length
        self.pretrained = pretrained
        self.delta = delta
        self.peft = peft
        self.revision = revision
        self.batch_schedule = 1
        self.batch_sizes = {}
        self.max_batch_size = max_batch_size
        self.softmax_dtype = (
            get_dtype(softmax_dtype) if softmax_dtype is not None else None
        )

        # "auto" or "auto:N" enables automatic batch-size detection.
        if str(batch_size).startswith("auto"):
            batch_size = batch_size.split(":")
            self.batch_size_per_gpu = batch_size[0]
            self.batch_schedule = float(batch_size[1]) if len(batch_size) > 1 else 1
        else:
            self.batch_size_per_gpu = int(batch_size)

        if isinstance(pretrained, str):
            if gpus >= 1 or str(self.device) == "mps":
                # TODO: can remove this whole snippet except in the mps case, perhaps?
                if not (parallelize or autogptq or hasattr(self, "accelerator")):
                    # place model onto device requested manually,
                    # if not using HF Accelerate or device_map
                    # or any other option that preloads model onto device
                    try:
                        self.model.to(self.device)
                    except ValueError:
                        eval_logger.debug(
                            "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
                        )
            # multigpu data-parallel support when launched with accelerate
            if gpus > 1:
                if accelerator.num_processes > 1:
                    if parallelize:
                        eval_logger.warning(
                            "You are both using a HF Accelerate `device_map` (`--model_args parallelize=True`) and launching via `accelerate launch`. This will attempt to do model and data parallelism depending on the resources available."
                        )
                    elif gpus > accelerator.num_processes:
                        eval_logger.warning(
                            "WARNING: The number of total system GPUs does not match the number of spawned processes. "
                            "If you would like to use data parallelism, please launch the script "
                            "with 'accelerate launch *script*'. "
                            f"Current run will proceed with {accelerator.num_processes} devices."
                        )
                    if self.accelerator.is_local_main_process:
                        eval_logger.info(
                            f"Using {gpus} devices with data parallelism"
                        )

                    self._device = torch.device(f"{accelerator.device}")
                    self.accelerator = accelerator

                    self._rank = self.accelerator.local_process_index
                    self._world_size = self.accelerator.num_processes
                else:
                    # if we aren't launching via accelerate, ditch
                    self._rank = 0
                    self._world_size = 1
        else:
            # if a PreTrainedModel was passed into HFLM, we forgo distributed setup.
            eval_logger.warning(
                "Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration"
            )
            self._rank = 0
            self._world_size = 1

        self.custom_prefix_token_id = prefix_token_id
        if prefix_token_id is not None:
            eval_logger.info(
                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
            )
304
+
305
+ def _get_accelerate_args(
306
+ self,
307
+ parallelize: Optional[bool] = None,
308
+ device_map: Optional[str] = "auto",
309
+ max_memory_per_gpu: Optional[Union[int, str]] = None,
310
+ max_cpu_memory: Optional[Union[int, str]] = None,
311
+ offload_folder: Optional[str] = "./offload",
312
+ gpus: Optional[int] = None,
313
+ ) -> dict:
314
+ """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
315
+ num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1))
316
+ num_machines = int(os.environ.get("WORLD_SIZE", 0)) // num_local_processes
317
+ if (
318
+ num_machines == 0
319
+ and hasattr(self, "accelerator")
320
+ and self.accelerator is not None
321
+ ):
322
+ eval_logger.info(
323
+ "We are not in a distributed setting for accelerate. Setting model_parallel to False."
324
+ )
325
+ parallelize = False
326
+
327
+ if parallelize is None:
328
+ # If parallelism is unset by the user, we automatically assign model parallelism
329
+ # if enough extra GPUs are available
330
+ max_memory_all_gpus = get_max_memory()
331
+ # We just want gpu, not cpu, max memory
332
+ if "cpu" in max_memory_all_gpus:
333
+ del max_memory_all_gpus["cpu"]
334
+ parallelize = bool(num_local_processes < len(max_memory_all_gpus))
335
+ eval_logger.info(
336
+ f"Setting model parallel to {parallelize} since "
337
+ f"the number of local processes is {num_local_processes} "
338
+ f"and the number of GPUs is {len(max_memory_all_gpus)}"
339
+ )
340
+
341
+ args = {}
342
+ if parallelize: # Model parallelism will be used
343
+ max_memory = {}
344
+ if max_memory_per_gpu is not None: # Using the provided memory requirements
345
+ max_memory_per_gpu_map = {
346
+ device_idx: max_memory_per_gpu for device_idx in range(gpus)
347
+ }
348
+ else: # Estimating the possible memory requirements
349
+ max_memory_all_gpus = get_max_memory()
350
+ if "cpu" in max_memory_all_gpus:
351
+ del max_memory_all_gpus["cpu"]
352
+ if not hasattr(self, "accelerator"):
353
+ max_memory_per_gpu_map = {
354
+ k: v for k, v in max_memory_all_gpus.items()
355
+ }
356
+ else:
357
+ # use only 1 / num_processes of the GPUs if we are running under accelerate launch
358
+ max_memory_per_gpu_map = {
359
+ k: v
360
+ for k, v in max_memory_all_gpus.items()
361
+ if k % num_local_processes
362
+ == (self.accelerator.process_index % num_local_processes)
363
+ }
364
+ args["max_memory"] = max_memory_per_gpu_map
365
+ args["device_map"] = "auto" if device_map is None else device_map
366
+ eval_logger.info(
367
+ f"Model parallel was set to True, setting max memory per GPU to {max_memory_per_gpu_map} and device map to {args.get('device_map')}"
368
+ )
369
+
370
+ if max_cpu_memory is not None:
371
+ max_memory["cpu"] = max_cpu_memory
372
+
373
+ args["offload_folder"] = offload_folder
374
+ elif (
375
+ device_map is None
376
+ ): # No model parallelism, we use the default provided device for our model
377
+ if hasattr(self, "accelerator"):
378
+ device_map = {"": f"{self.accelerator.device}"}
379
+ else:
380
+ device_map = {"": str(self.device)}
381
+ args["max_memory"] = None
382
+ args["device_map"] = device_map
383
+ eval_logger.info(
384
+ f"Model parallel was set to False, max memory was not set, and device map was set to {device_map}"
385
+ )
386
+ else:
387
+ args["max_memory"] = None
388
+ args["device_map"] = None
389
+ eval_logger.info("Model parallel was set to False.")
390
+
391
+ return args
392
+
393
+ @property
394
+ def config(self):
395
+ # return the associated transformers.AutoConfig for the given pretrained model.
396
+ return self._config
397
+
398
+ @property
399
+ def model(self):
400
+ # returns the model, unwrapping it if using Accelerate
401
+ if hasattr(self, "accelerator"):
402
+ return self.accelerator.unwrap_model(self._model)
403
+ else:
404
+ return self._model
405
+
406
+ @property
407
+ def eot_token_id(self):
408
+ # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
409
+ return self.tokenizer.eos_token_id
410
+
411
+ @property
412
+ def prefix_token_id(self):
413
+ # it is used as prefix for loglikelihood
414
+ if self.custom_prefix_token_id is not None:
415
+ return self.custom_prefix_token_id
416
+ if self.tokenizer.bos_token_id is not None:
417
+ return self.tokenizer.bos_token_id
418
+ return self.tokenizer.eos_token_id
419
+
420
+ @property
421
+ def max_length(self):
422
+ if self._max_length: # if max length manually set, return it
423
+ return self._max_length
424
+ seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
425
+ for attr in seqlen_config_attrs:
426
+ if hasattr(self.model.config, attr):
427
+ return getattr(self.model.config, attr)
428
+ if hasattr(self.tokenizer, "model_max_length"):
429
+ if self.tokenizer.model_max_length == 1000000000000000019884624838656:
430
+ return self._DEFAULT_MAX_LENGTH
431
+ return self.tokenizer.model_max_length
432
+ return self._DEFAULT_MAX_LENGTH
433
+
434
+ @property
435
+ def max_gen_toks(self) -> int:
436
+ return 256
437
+
438
+ @property
439
+ def batch_size(self):
440
+ return self.batch_size_per_gpu
441
+
442
+ @property
443
+ def device(self):
444
+ return self._device
445
+
446
+ @property
447
+ def rank(self):
448
+ return self._rank
449
+
450
+ @property
451
+ def world_size(self):
452
+ return self._world_size
453
+
454
+ @property
455
+ def tokenizer_name(self) -> str:
456
+ return self.tokenizer.name_or_path.replace("/", "__")
457
+
458
    def _get_backend(
        self,
        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
        backend: Literal["default", "causal", "seq2seq"] = "default",
        trust_remote_code: Optional[bool] = False,
    ) -> None:
        """
        Helper method during initialization.
        Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
        sets `self.AUTO_MODEL_CLASS` appropriately if not already set.

        :param config: the loaded model config, consulted via its `model_type` attribute
        :param backend: "causal"/"seq2seq" forces that backend; "default" auto-detects
        :param trust_remote_code: only used to soften the warning when auto-detection fails

        **If not calling HFLM.__init__() or HFLM._get_backend() within a subclass of HFLM,
        user must set `self.backend` to be either "causal" or "seq2seq" manually!**
        """

        assert backend in ["default", "causal", "seq2seq"]

        if backend != "default":
            # if we've settled on non-default backend, use that manually
            if backend == "causal":
                self.backend = backend
            elif backend == "seq2seq":
                self.backend = backend
            eval_logger.info(
                f"Overrode HF model backend type, and using type '{self.backend}'"
            )
        else:
            # determine and use the default HF backend for this model, based on its config + metadata.
            if (
                getattr(config, "model_type")
                in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
            ):
                # first check if model type is listed under seq2seq models, since some
                # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
                # these special cases should be treated as seq2seq models.
                self.backend = "seq2seq"
                eval_logger.debug(f"Using model type '{self.backend}'")
            elif (
                # NOTE(review): this branch reads `self.config` while the branch above
                # reads the `config` parameter — presumably both refer to the same
                # object (callers passing `self.config`); confirm before relying on it.
                getattr(self.config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
            ):
                self.backend = "causal"
                eval_logger.debug(f"Using model type '{self.backend}'")
            else:
                if not trust_remote_code:
                    eval_logger.warning(
                        "HF model type is neither marked as CausalLM or Seq2SeqLM. \
                    This is expected if your model requires `trust_remote_code=True` but may be an error otherwise."
                        "Setting backend to causal"
                    )
                # if model type is neither in HF transformers causal or seq2seq model registries
                # then we default to assuming AutoModelForCausalLM
                self.backend = "causal"
                eval_logger.info(
                    f"Model type cannot be determined. Using default model type '{self.backend}'"
                )

        # map the chosen backend onto an AutoModel class, unless a subclass
        # already pinned AUTO_MODEL_CLASS itself
        if self.AUTO_MODEL_CLASS is None:
            if self.backend == "causal":
                self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM
            elif self.backend == "seq2seq":
                self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM
519
+
520
    def _get_config(
        self,
        pretrained: str,
        revision: str = "main",
        trust_remote_code: bool = False,
        gguf_file: Optional[str] = None,
        subfolder: str = "",
    ) -> None:
        """Load the HuggingFace `AutoConfig` for `pretrained` and store it on
        `self._config`.

        Note: this returns None; the loaded config is exposed via the `config`
        property, not returned here.
        """
        self._config = transformers.AutoConfig.from_pretrained(
            pretrained,
            revision=revision,
            trust_remote_code=trust_remote_code,
            gguf_file=gguf_file,
            subfolder=subfolder,
        )
536
+
537
    def _create_model(
        self,
        pretrained: str,
        revision: Optional[str] = "main",
        dtype: Optional[Union[str, torch.dtype]] = "auto",
        trust_remote_code: Optional[bool] = False,
        # arguments used for splitting a model across GPUs naively.
        # only used if `parallelize=True`.
        # (accelerate naive PP (device_map) options)
        parallelize: Optional[bool] = False,
        gpus: Optional[int] = None,
        max_memory_per_gpu: Optional[Union[int, str]] = None,
        max_cpu_memory: Optional[Union[int, str]] = None,
        offload_folder: Optional[str] = "./offload",
        # PEFT, delta weights and quantization options
        peft: Optional[str] = None,
        delta: Optional[str] = None,
        autogptq: Optional[Union[bool, str]] = False,
        gptqmodel: Optional[bool] = False,
        gguf_file: Optional[str] = None,
        quantization_config: Optional[Dict[str, Any]] = None,
        subfolder: str = "",
        **kwargs,
    ) -> None:
        """
        Initializes an HF or HF-compatible PreTrainedModel from scratch
        inside HFLM, using the kwargs passed into self.__init__().

        Also handles functionality such as AutoGPTQ usage and PEFT wrapping.

        The loaded model is stored on `self._model`; nothing is returned.

        For future similar extensions to AutoGPTQ that are not core to HF's ecosystem,
        (such as PyTorch models that are nearly, but not quite, fully mirroring
        HF's public interface relied on in this HFLM class)
        please consider subclassing HFLM and overriding this and other methods as needed.
        """

        model_kwargs = kwargs if kwargs else {}

        # merge in device_map / max_memory decided by the accelerate helper
        model_kwargs.update(
            self._get_accelerate_args(
                parallelize=parallelize,
                device_map=kwargs.get("device_map", None),
                max_memory_per_gpu=max_memory_per_gpu,
                max_cpu_memory=max_cpu_memory,
                offload_folder=offload_folder,
                gpus=gpus,
            )
        )

        if not autogptq and not gptqmodel:
            # plain HF loading path (optionally 4-bit via bitsandbytes kwargs)
            if model_kwargs.get("load_in_4bit", None):
                assert transformers.__version__ >= "4.30.0", (
                    "load_in_4bit requires transformers >= 4.30.0"
                )
            if transformers.__version__ >= "4.30.0":
                if model_kwargs.get("load_in_4bit", None):
                    if model_kwargs.get("bnb_4bit_compute_dtype", None):
                        # normalize a string dtype (e.g. "bfloat16") to torch.dtype
                        model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(
                            model_kwargs["bnb_4bit_compute_dtype"]
                        )

            self._model = self.AUTO_MODEL_CLASS.from_pretrained(
                pretrained,
                revision=revision,
                torch_dtype=get_dtype(dtype),
                trust_remote_code=trust_remote_code,
                gguf_file=gguf_file,
                quantization_config=quantization_config,
                subfolder=subfolder,
                **model_kwargs,
            )
        else:
            # quantized loading path: exactly one of autogptq / gptqmodel
            if autogptq and gptqmodel:
                raise ValueError(
                    "Cannot use both 'autogptq' and 'gptqmodel' options at the same time."
                )

            if autogptq:
                try:
                    from auto_gptq import AutoGPTQForCausalLM
                except ModuleNotFoundError as exception:
                    raise type(exception)(
                        "Tried to load auto_gptq, but auto-gptq is not installed ",
                        "please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
                    )

                self._model = AutoGPTQForCausalLM.from_quantized(
                    pretrained,
                    trust_remote_code=trust_remote_code,
                    # autogptq may be True (use defaults) or a path to the quantized weights
                    model_basename=None if autogptq is True else Path(autogptq).stem,
                    use_safetensors=True
                    if autogptq is True
                    else autogptq.endswith(".safetensors"),
                    **model_kwargs,
                )

            if gptqmodel:
                try:
                    from gptqmodel import GPTQModel
                except ModuleNotFoundError as exception:
                    raise type(exception)(
                        "Tried to load gptqmodel, but gptqmodel is not installed ",
                        "please install gptqmodel via `pip install gptqmodel --no-build-isolation` or `pip install lm-eval[gptqmodel] --no-build-isolation`",
                    )

                self._model = GPTQModel.from_quantized(
                    pretrained, trust_remote_code=trust_remote_code, **model_kwargs
                )

        # optional post-processing: PEFT adapter or additive delta weights
        if peft and delta:
            raise ValueError(
                "Cannot use both 'peft' and 'delta' options at the same time."
            )

        if peft:
            if model_kwargs.get("load_in_4bit", None):
                if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
                    raise AssertionError("load_in_4bit requires peft >= 0.4.0")
            if self._model.config.vocab_size != len(self.tokenizer):
                # resize model for LoRAs with added tokens
                eval_logger.info(
                    f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
                )
                self._model.resize_token_embeddings(len(self.tokenizer))
            self._model = PeftModel.from_pretrained(
                self._model, peft, revision=revision
            )
        elif delta:
            if autogptq:
                eval_logger.warning(
                    "Delta weights might trigger unexpected behavior when used with AutoGPTQ."
                )
            # load the delta model and add its weights into the base model in place
            _model_delta = self.AUTO_MODEL_CLASS.from_pretrained(
                delta,
                revision=revision,
                torch_dtype=get_dtype(dtype),
                trust_remote_code=trust_remote_code,
                **model_kwargs,
            )
            for name, param in self._model.state_dict().items():
                try:
                    param.data += _model_delta.state_dict()[name]
                except KeyError:
                    raise KeyError(f"Delta model is missing weights for layer: {name}")
                except Exception as e:
                    raise RuntimeError(
                        f"Failed to add delta weights to layer {name}. Error: {e}"
                    )

            del _model_delta

        return None
689
+
690
    def _create_tokenizer(
        self,
        pretrained: Union[str, transformers.PreTrainedModel],
        tokenizer: Optional[
            Union[
                str,
                transformers.PreTrainedTokenizer,
                transformers.PreTrainedTokenizerFast,
            ]
        ],
        revision: Optional[str] = "main",
        trust_remote_code: Optional[bool] = False,
        use_fast_tokenizer: Optional[bool] = True,
        gguf_file: Optional[str] = None,
        add_bos_token: Optional[bool] = False,
        subfolder: Optional[str] = "",
    ) -> None:
        """
        Helper method during initialization.

        Create a tokenizer object corresponding to the correct
        tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.

        The result is stored on `self.tokenizer`; nothing is returned.
        """
        kwargs = {
            "revision": revision,
            "trust_remote_code": trust_remote_code,
        }

        # gguf format embeds tokenizer and is not compatible with hf tokenizer `use_fast` param
        if gguf_file is not None:
            kwargs["gguf_file"] = gguf_file
        else:
            kwargs["use_fast"] = use_fast_tokenizer

        if add_bos_token:
            kwargs["add_bos_token"] = True

        if subfolder:
            kwargs["subfolder"] = subfolder

        if tokenizer:
            # caller supplied either a tokenizer name or an already-built tokenizer
            if isinstance(tokenizer, str):
                self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                    tokenizer, **kwargs
                )
            else:
                assert isinstance(
                    tokenizer, transformers.PreTrainedTokenizer
                ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
                self.tokenizer = tokenizer
        else:
            # Get tokenizer based on 'pretrained'
            if isinstance(pretrained, str):
                model_name = pretrained
            else:
                # get the HF hub name via accessor on model
                model_name = self.model.name_or_path
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                model_name, **kwargs
            )
        return None
751
+
752
    def _detect_batch_size(self, requests=None, pos: int = 0):
        """Empirically find the largest batch size that fits in memory.

        Runs dummy forward passes (starting at `self.max_batch_size`, halving on
        OOM via accelerate's `find_executable_batch_size`) with sequence lengths
        derived from `requests[pos]` if given, else `self.max_length`.
        Under multi-GPU, the minimum batch size across ranks is used so all
        ranks agree.
        """
        if requests:
            # size the probe tensors from the actual request at `pos`
            _, context_enc, continuation_enc = requests[pos]
            max_length = len(
                (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]
            )
            max_context_enc = len(context_enc[-(self.max_length + 1) :])
            max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
        else:
            max_length = self.max_length
            max_context_enc = max_length
            max_cont_enc = max_length

        # if OOM, then halves batch_size and tries again
        @find_executable_batch_size(starting_batch_size=self.max_batch_size)
        def forward_batch(batch_size):
            if self.backend == "seq2seq":
                length = max(max_context_enc, max_cont_enc)
                batched_conts = torch.ones(
                    (batch_size, length), device=self.device
                ).long()
                test_batch = torch.ones((batch_size, length), device=self.device).long()
                call_kwargs = {
                    "attn_mask": test_batch,
                    "labels": batched_conts,
                }
            else:
                call_kwargs = {}
                test_batch = torch.ones(
                    (batch_size, max_length), device=self.device
                ).long()
            # repeat a few times so transient allocations surface an OOM reliably
            for _ in range(5):
                out = F.log_softmax(  # noqa: F841
                    self._model_call(test_batch, **call_kwargs),
                    dim=-1,
                    dtype=self.softmax_dtype,
                )

            return batch_size

        try:
            batch_size = forward_batch()
        except RuntimeError as e:
            if "No executable batch size found" in str(e):
                # even batch size 1 OOMed during probing; fall back to 1 anyway
                batch_size = 1
            else:
                raise

        if self.world_size > 1:
            # if multi-GPU, always take minimum over all selected batch sizes
            max_rnk_bs = torch.tensor([batch_size], device=self.device)
            gathered = (
                self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist()
            )
            batch_size = min(gathered)
            clear_torch_cache()
            return batch_size

        clear_torch_cache()
        return batch_size
812
+
813
+ def tok_encode(
814
+ self, string: str, left_truncate_len=None, add_special_tokens=None
815
+ ) -> List[int]:
816
+ """ """
817
+ # default for None - empty dict, use predefined tokenizer param
818
+ # used for all models except for CausalLM or predefined value
819
+ special_tokens_kwargs = {}
820
+
821
+ # by default for CausalLM - false or self.add_bos_token is set
822
+ if add_special_tokens is None:
823
+ if self.backend == "causal":
824
+ special_tokens_kwargs = {
825
+ "add_special_tokens": False or self.add_bos_token
826
+ }
827
+ # otherwise the method explicitly defines the value
828
+ else:
829
+ special_tokens_kwargs = {"add_special_tokens": add_special_tokens}
830
+
831
+ encoding = self.tokenizer.encode(string, **special_tokens_kwargs)
832
+
833
+ # left-truncate the encoded context to be at most `left_truncate_len` tokens long
834
+ if left_truncate_len:
835
+ encoding = encoding[-left_truncate_len:]
836
+
837
+ return encoding
838
+
839
+ def tok_batch_encode(
840
+ self,
841
+ strings: List[str],
842
+ padding_side: str = "left",
843
+ left_truncate_len: int = None,
844
+ truncation: bool = False,
845
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
846
+ # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
847
+ old_padding_side = self.tokenizer.padding_side
848
+ self.tokenizer.padding_side = padding_side
849
+
850
+ add_special_tokens = {}
851
+ if self.backend == "causal":
852
+ add_special_tokens = {"add_special_tokens": False or self.add_bos_token}
853
+
854
+ encoding = self.tokenizer(
855
+ strings,
856
+ truncation=truncation,
857
+ padding="longest",
858
+ return_tensors="pt",
859
+ **add_special_tokens,
860
+ )
861
+ if left_truncate_len:
862
+ original_lengths = encoding["input_ids"].size(1)
863
+ if original_lengths > left_truncate_len:
864
+ eval_logger.warn(
865
+ f"Left truncation applied. Original sequence length was {original_lengths}, "
866
+ f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
867
+ )
868
+ encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
869
+ encoding["attention_mask"] = encoding["attention_mask"][
870
+ :, -left_truncate_len:
871
+ ]
872
+ self.tokenizer.padding_side = old_padding_side
873
+
874
+ return encoding["input_ids"], encoding["attention_mask"]
875
+
876
+ def tok_decode(self, tokens, skip_special_tokens=True):
877
+ return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
878
+
879
    def _model_call(self, inps, attn_mask=None, labels=None):
        """
        Run a forward pass and return raw logits (no gradient tracking).

        :param inps: torch.Tensor
            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
            [batch, sequence_ctx]. the size of sequence may vary from call to call
        :param attn_mask: torch.Tensor, optional
            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
        :param labels: torch.Tensor, optional
            A torch tensor of shape [batch, (sequence_ctx + sequence_cont)]. Only passed
            (and must be passed) if self.AUTO_MODEL_CLASS is transformers.AutoModelForSeq2SeqLM
        :return
            A torch tensor of shape [batch, sequence, vocab] with the
            logits returned from the model's decoder
        """
        # inference only — never build an autograd graph here
        with torch.no_grad():
            if attn_mask is not None or labels is not None:
                # seq2seq path: both mask and labels must be provided together
                assert attn_mask is not None and labels is not None
                assert self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM
                return self.model(
                    input_ids=inps, attention_mask=attn_mask, labels=labels
                ).logits
            else:
                # decoder-only (or vision-to-seq) path: plain forward on input ids
                assert self.AUTO_MODEL_CLASS in (
                    transformers.AutoModelForCausalLM,
                    transformers.AutoModelForVision2Seq,
                )
                return self.model(inps).logits
907
+
908
    def _model_generate(self, context, max_length, stop, **generation_kwargs):
        """Generate continuations for `context` up to `max_length` total tokens,
        halting early on any of the `stop` sequences.

        Defaults to greedy decoding (do_sample=False) when temperature is unset/0.0.
        """
        # temperature = 0.0 if not set
        # if do_sample is false and temp==0.0:
        # remove temperature, as do_sample=False takes care of this
        # and we don't want a warning from HF
        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
        do_sample = generation_kwargs.get("do_sample", None)

        # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
        if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
            generation_kwargs["do_sample"] = do_sample = False

        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
            generation_kwargs.pop("temperature")
        # build stopping criteria
        stopping_criteria = stop_sequences_criteria(
            self.tokenizer, stop, context.shape[1], context.shape[0]
        )
        return self.model.generate(
            input_ids=context,
            max_length=max_length,
            stopping_criteria=stopping_criteria,
            pad_token_id=self.tokenizer.pad_token_id,
            use_cache=True,
            **generation_kwargs,
        )
934
+
935
    def _select_cont_toks(
        self, logits: torch.Tensor, contlen: int = None, inplen: int = None
    ) -> torch.Tensor:
        """Slice out only the continuation-token logits from a sequence of logits.

        :param logits: [seq, vocab] logits for one example
        :param contlen: number of continuation tokens (required for both backends)
        :param inplen: input length including context (required only for causal)
        :return: [contlen, vocab] logits covering just the continuation
        """
        if self.backend == "causal":
            assert contlen and inplen, (
                "Must pass input len and cont. len to select scored logits for causal LM"
            )
            # discard right-padding.
            # also discard the input/context tokens. we'll only score continuations.
            logits = logits[inplen - contlen : inplen]
        elif self.backend == "seq2seq":
            assert contlen and not inplen, (
                "Selecting scored logits for Seq2SeqLM requires only cont. len"
            )
            # only discard right-padding.
            # the logits input to this fn only contain decoder-side tokens.
            logits = logits[:contlen]

        return logits
954
+
955
    def loglikelihood_rolling(
        self, requests: List[Instance], disable_tqdm: bool = False
    ) -> List[float]:
        """Compute the full rolling log-likelihood of each request's string.

        Each string is split into disjoint max-length windows, all windows from
        all requests are scored in batches, and per-request totals are summed
        back together. Returns one float (total loglikelihood) per request.
        """
        adaptive_batch_size = None
        if self.batch_size == "auto":
            # using rolling window with maximum context
            print("Passed argument batch_size = auto. Detecting largest batch size")
            batch_size = self._detect_batch_size()
            print(f"Determined Largest batch size: {batch_size}")
            adaptive_batch_size = batch_size

        # First, collect all windows from all requests
        all_windows = []  # List of (request_idx, window) tuples
        request_window_counts = []  # Track number of windows per request

        for req_idx, (string,) in enumerate(
            tqdm(
                [req.args for req in requests],
                disable=(disable_tqdm or (self.rank != 0)),
            )
        ):
            rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
                map(
                    utils.make_disjoint_window,
                    utils.get_rolling_token_windows(
                        token_list=self.tok_encode(string),
                        prefix_token=self.prefix_token_id,
                        max_seq_len=self.max_length,
                        context_len=1,
                    ),
                )
            )

            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
            # prepend None as the cache key: these windows must not be cached
            # individually under "loglikelihood" (see _loglikelihood_tokens)
            windows = [(None,) + x for x in rolling_token_windows]

            # Store windows with their request index
            all_windows.extend((req_idx, window) for window in windows)
            request_window_counts.append(len(windows))

        # Handle distributed case padding
        # (every rank must run the same number of batches, so shorter ranks
        # duplicate their first window and drop the extra results afterwards)
        pad_amnt = 0
        if self.world_size > 1:
            mytensor = torch.tensor(len(all_windows), device=self.device)
            gathered = self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
            pad_amnt = max(gathered) - gathered[self.rank]
            if pad_amnt > 0:
                all_windows += pad_amnt * [all_windows[0]]

        all_nlls = []
        batch_size = adaptive_batch_size or self.batch_size
        for i in range(0, len(all_windows), batch_size):
            batch = all_windows[i : i + batch_size]
            # Extract just the windows for processing, keeping track of request indices
            batch_indices, batch_windows = zip(*batch)

            batch_nlls = self._loglikelihood_tokens(
                requests=batch_windows,
                disable_tqdm=False,
                override_bs=len(batch_windows),
            )
            # Store results with their request indices
            all_nlls.extend(zip(batch_indices, batch_nlls))

        # Remove padding if necessary
        if (self.world_size > 1) and (pad_amnt > 0):
            all_nlls = all_nlls[:-pad_amnt]

        # Reconstruct per-request loglikelihoods
        loglikelihoods = []
        current_idx = 0
        for window_count in request_window_counts:
            # Get all nlls for this request
            request_nlls = all_nlls[current_idx : current_idx + window_count]
            # Sum up the nlls for this request (discarding is_greedy)
            request_total = sum(nll[0] for _, nll in request_nlls)
            loglikelihoods.append(request_total)
            current_idx += window_count

            # cache at the per-example level here, since the individual windows
            # were scored with a None cache key above
            string = requests[len(loglikelihoods) - 1].args[0]
            self.cache_hook.add_partial(
                "loglikelihood_rolling", (string,), request_total
            )

        return loglikelihoods
1040
+
1041
    def _batch_scheduler(self, pos, n_reordered_requests):
        """Return the batch size to use at position `pos`, re-detecting it
        `self.batch_schedule` times over the run (memoized in `self.batch_sizes`).

        :param pos: index of the current request in the reordered list
        :param n_reordered_requests: the reordered request list (its length sets the schedule)
        """
        # which of the `batch_schedule` segments `pos` falls in
        sched = pos // int(len(n_reordered_requests) / self.batch_schedule)
        if sched in self.batch_sizes:
            return self.batch_sizes[sched]
        if (len(self.batch_sizes) > 1) and (
            self.batch_sizes[sched - 1] == self.max_batch_size
        ):
            # if previous batch size is already maximal, skip recomputation
            self.batch_sizes[sched] = self.max_batch_size
            return self.batch_sizes[sched]
        print(
            f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
        )
        self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos)
        print(f"Determined largest batch size: {self.batch_sizes[sched]}")
        return self.batch_sizes[sched]
1057
+
1058
    def _loglikelihood_tokens(
        self,
        requests: List[Tuple[Tuple[str, str], List[int], List[int]]],
        disable_tqdm: bool = False,
        override_bs: int = None,
    ) -> List[Tuple[float, bool]]:
        """Score (context, continuation) token pairs.

        For each request returns (sum of continuation log-probs, whether greedy
        argmax decoding would reproduce the continuation exactly).

        :param requests: tuples of (cache key or None, context token ids, continuation token ids)
        :param disable_tqdm: suppress the progress bar
        :param override_bs: force this batch size (used by loglikelihood_rolling)
        """
        # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
        res = []

        def _collate(req: Tuple[Tuple[str, str], List[int], List[int]]):
            """Defines the key for the sorted method"""
            # the negative sign on len(toks) sorts descending - this has a few advantages:
            # - time estimates will always be over not underestimates, which is more useful for planning
            # - to know the size of a batch when going through the list, you know the first one is always the batch
            #   padded context length. this is useful to simplify the batching logic and more importantly to make
            #   automatic adaptive batches much much easier to implement
            # - any OOMs will happen right away rather than near the end

            toks = req[1] + req[2]
            return -len(toks), tuple(toks)

        def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]):
            """Defines the key to group and lookup one-token continuations"""
            # Use with group_by="contexts" (optional)"
            # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
            # speeds up some multiple-choice tasks proportionally to the number of choices.
            # groups requests by context+continuation[:-1] and infer on one request/group.
            return req[-2] + req[-1][:-1]

        re_ord = Collator(
            requests,
            sort_fn=_collate,
            group_by="contexts"
            if self.backend == "causal" and self.logits_cache
            else None,
            group_fn=_lookup_one_token_cont,
        )

        # automatic (variable) batch size detection for vectorization
        # pull longest context sample from request
        n_reordered_requests = len(re_ord)
        batch_size = (
            self.batch_size
            if self.batch_size != "auto"
            else override_bs
            if override_bs is not None
            else 0
        )
        batch_fn = (
            self._batch_scheduler
            if self.batch_size == "auto"
            and n_reordered_requests > 0
            and not override_bs
            else None
        )

        chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn)
        pbar = tqdm(
            total=len(requests),
            disable=(disable_tqdm or (self.rank != 0)),
            desc="Running loglikelihood requests",
        )
        for chunk in chunks:
            inps = []
            cont_toks_list = []
            inplens = []

            conts = []
            encoder_attns = []

            padding_len_inp = None
            padding_len_cont = None
            # because vectorizing is annoying, we first convert each (context, continuation) pair to padded
            # tensors, then we pack them together into a batch, call the model, and then pick it all apart
            # again because vectorizing is annoying

            for _, context_enc, continuation_enc in chunk:
                # sanity check
                assert len(context_enc) > 0
                assert len(continuation_enc) > 0
                assert len(continuation_enc) <= self.max_length

                # how this all works (illustrated on a causal decoder-only setup):
                #          CTX      CONT
                # inp    0 1 2 3|4 5 6 7 8 9   <- last token is deleted by inp[:, :-1]
                # model  \               \
                # logits   1 2 3|4 5 6 7 8 9   <- the ctx half gets tossed out by the
                # cont_toks      4 5 6 7 8 9      [:, -len(continuation_enc):, :self.vocab_size] slice

                # when too long to fit in context, truncate from the left
                if self.backend == "causal":
                    total_length = len(context_enc) + len(continuation_enc)
                    if total_length > self.max_length + 1:
                        eval_logger.warning(
                            f"Combined length of context ({len(context_enc)}) and continuation ({len(continuation_enc)}) "
                            f"exceeds model's maximum length ({self.max_length}). "
                            f"Truncating {total_length - self.max_length + 1} tokens from the left."
                        )
                    inp = torch.tensor(
                        (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
                        dtype=torch.long,
                        device=self.device,
                    )
                    (inplen,) = inp.shape
                elif self.backend == "seq2seq":
                    inp = torch.tensor(
                        (context_enc)[-self.max_length :],
                        dtype=torch.long,
                        device=self.device,
                    )
                    (inplen,) = inp.shape

                    # build encoder attn masks
                    encoder_attns.append(torch.ones_like(inp))

                    cont = torch.tensor(
                        (continuation_enc)[-self.max_length :],
                        # TODO: left-shift these?
                        # TODO: our code assumes we never end up truncating conts for either model type
                        dtype=torch.long,
                        device=self.device,
                    )
                    (contlen,) = cont.shape

                    conts.append(cont)

                    padding_len_cont = (
                        max(padding_len_cont, contlen)
                        if padding_len_cont is not None
                        else contlen
                    )

                padding_len_inp = (
                    max(padding_len_inp, inplen)
                    if padding_len_inp is not None
                    else inplen
                )

                inps.append(inp)  # [1, inp_length]
                cont_toks_list.append(continuation_enc)
                inplens.append(inplen)

            # create encoder attn mask and batched conts, if seq2seq
            call_kwargs = {}
            if self.backend == "causal":
                batched_inps = pad_and_concat(
                    padding_len_inp, inps, padding_side="right"
                )  # [batch, padding_len_inp]
            elif self.backend == "seq2seq":
                # TODO: left-pad encoder inps and mask?
                batched_inps = pad_and_concat(
                    padding_len_inp, inps
                )  # [batch, padding_len_inp]
                batched_conts = pad_and_concat(
                    padding_len_cont, conts
                )  # [batch, padding_len_cont]
                batched_encoder_mask = pad_and_concat(
                    padding_len_inp, encoder_attns
                )  # [batch, padding_len_inp]
                call_kwargs = {
                    "attn_mask": batched_encoder_mask,
                    "labels": batched_conts,
                }

            multi_logits = F.log_softmax(
                self._model_call(batched_inps, **call_kwargs),
                dim=-1,
                dtype=self.softmax_dtype,
            )  # [batch, padding_length (inp or cont), vocab]

            for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip(
                chunk, multi_logits, inplens, cont_toks_list
            ):
                # Slice to original seq length
                contlen = len(cont_toks)
                # take only logits in the continuation
                # (discard context toks if decoder-only ; discard right-padding)
                # also discards + checks for "virtual tokens" in the causal LM's input window
                # from prompt/prefix tuning tokens, if applicable
                ctx_len = (
                    inplen + (logits.shape[0] - padding_len_inp)
                    if self.backend == "causal"
                    else None
                )
                logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
                logits = logits.unsqueeze(0)  # [1, seq, vocab]

                # Check if per-token argmax is exactly equal to continuation
                greedy_tokens = logits.argmax(dim=-1)

                # check for one-token continuation cache hits.
                # noop in case group_by != "contexts" or no cache hit and returns the
                # original args. Otherwise, expands the logits batch dimension and yields each
                # batch along with matching continuation tokens and prompt strings.
                # logits -> [1, seq, vocab]
                for request_str, cont_toks, logits in re_ord.get_cache(
                    req_str=request_str,
                    cxt_toks=ctx_tokens,
                    cont_toks=cont_toks,
                    logits=logits,
                ):
                    cont_toks = torch.tensor(
                        cont_toks, dtype=torch.long, device=self.device
                    ).unsqueeze(0)  # [1, seq]
                    # Use trailing slice [-cont_toks.shape[1]:] to handle variable length cont_len (but same ctx+cont[:-1]).
                    # i.e. continuations can be sliced at diff points. Collator ensures we have sufficient greedy_tokens
                    # by choosing key with longest cont if group_by="contexts".
                    max_equal = (
                        greedy_tokens[:, -cont_toks.shape[1] :] == cont_toks
                    ).all()

                    # Obtain log-probs at the corresponding continuation token indices
                    # last_token_slice = logits[:, -1, :].squeeze(0).tolist()
                    logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
                        -1
                    )  # [1, seq]

                    # Answer: (log prob, is-exact-match)
                    answer = (float(logits.sum()), bool(max_equal))

                    res.append(answer)

                    if request_str is not None:
                        # special case: loglikelihood_rolling produces a number of loglikelihood requests
                        # all with cache key None. instead do add_partial on the per-example level
                        # in the loglikelihood_rolling() function for those.
                        self.cache_hook.add_partial(
                            "loglikelihood", request_str, answer
                        )
                    pbar.update(1)

        pbar.close()

        # restore the caller's original request ordering
        return re_ord.get_original(res)
1292
+
1293
    def generate_until(
        self, requests: List[Instance], disable_tqdm: bool = False
    ) -> List[str]:
        """
        Generate free-form continuations for a list of `generate_until` requests.

        Requests are sorted by descending tokenized-context length, grouped by
        their generation kwargs (so e.g. greedy and temp=0.8 sampling never share
        a batch), generated batch-by-batch, truncated at stop sequences, and
        finally returned in the original request order.

        Args:
            requests: Instances whose `.args` are `(context_str, gen_kwargs_dict)`.
            disable_tqdm: suppress the progress bar (also suppressed on rank != 0).

        Returns:
            One decoded continuation string per request, in input order.
        """
        res = []

        def _collate(req: Tuple[str, dict]):
            """Defines the key for the sorted method"""
            # the negative sign on len(toks) sorts descending - this has a few advantages:
            # - time estimates will always be over not underestimates, which is more useful for planning
            # - to know the size of a batch when going through the list, you know the first one is always the batch
            # padded context length. this is useful to simplify the batching logic and more importantly to make
            # automatic adaptive batches much much easier to implement
            # - any OOMs will happen right away rather than near the end
            toks = self.tok_encode(req[0])
            return -len(toks), req[0]

        pbar = tqdm(
            total=len(requests),
            disable=(disable_tqdm or (self.rank != 0)),
            desc="Running generate_until requests",
        )
        adaptive_batch_size = None
        if self.batch_size == "auto":
            # using rolling window with maximum context
            print("Passed argument batch_size = auto. Detecting largest batch size")
            batch_size = self._detect_batch_size()
            print(f"Determined Largest batch size: {batch_size}")
            adaptive_batch_size = batch_size
        # for each different set of kwargs, we execute all requests, by batch.
        batch_size = (
            self.batch_size
            if self.batch_size != "auto"
            else adaptive_batch_size
            if adaptive_batch_size is not None
            else 0
        )
        # only use the dynamic batch scheduler when "auto" was requested but no
        # fixed size has been detected yet
        batch_fn = (
            self._batch_scheduler
            if self.batch_size == "auto" and not adaptive_batch_size
            else None
        )

        # we group requests by their generation_kwargs,
        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
        # in the same batch.
        # group_fn=lambda x: x[1] -> x=(context, gen_kwargs)
        re_ords = Collator(
            [reg.args for reg in requests],
            sort_fn=_collate,
            group_by="gen_kwargs",
            group_fn=lambda x: x[1],
        )
        chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn)
        eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
        for chunk in chunks:
            contexts, all_gen_kwargs = zip(*chunk)
            # we assume all gen kwargs in the batch are the same
            # this is safe to assume because the `grouper` object ensures it.
            gen_kwargs = all_gen_kwargs[0]
            # unpack our keyword arguments.
            if isinstance(gen_kwargs, dict):
                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
                # add EOS token to stop sequences
                until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
            else:
                raise ValueError(
                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                )
            if "max_gen_toks" in kwargs.keys():
                max_gen_toks = kwargs.pop("max_gen_toks")
            else:
                max_gen_toks = self.max_gen_toks

            # set the max length in tokens of inputs ("context_enc")
            # NOTE(review): assumes self.backend is "causal" or "seq2seq";
            # for any other value max_ctx_len would be unbound below — confirm
            # upstream validation.
            if self.backend == "causal":
                # max len for inputs = max length, minus room to generate the max new tokens
                max_ctx_len = self.max_length - max_gen_toks
                assert max_ctx_len > 0, (
                    f"Invalid configuration: requested max tokens to generate ({max_gen_toks}) must be less than model's maximum sequence length ({self.max_length})."
                )
            elif self.backend == "seq2seq":
                # max len for inputs = encoder's whole max_length
                max_ctx_len = self.max_length

            # encode, pad, and truncate contexts for this batch
            context_enc, attn_masks = self.tok_batch_encode(
                contexts,
                left_truncate_len=max_ctx_len,
                truncation=self.truncation,
            )
            context_enc = context_enc.to(self.device)
            attn_masks = attn_masks.to(self.device)

            if "max_length" not in kwargs:
                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks

            # perform batched generation
            cont = self._model_generate(
                context=context_enc,
                attention_mask=attn_masks,
                stop=until,
                **kwargs,
            )

            cont_toks_list = cont.tolist()
            for cont_toks, context in zip(cont_toks_list, contexts):
                # discard context + left-padding toks if using causal decoder-only LM
                if self.backend == "causal":
                    cont_toks = cont_toks[context_enc.shape[1] :]

                s = self.tok_decode(cont_toks)

                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
                for term in until:
                    if len(term) > 0:
                        # ignore '' separator,
                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                        s = s.split(term)[0]

                res.append(s)

                self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                pbar.update(1)
        # reorder this group of results back to original unsorted form
        res = re_ords.get_original(res)

        pbar.close()

        return res
1422
+
1423
+ def apply_chat_template(
1424
+ self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
1425
+ ) -> str:
1426
+ """
1427
+ Method to apply a chat template to a list of chat history between user and model.
1428
+ """
1429
+ try:
1430
+ chat_templated = self.tokenizer.apply_chat_template(
1431
+ chat_history,
1432
+ tokenize=False,
1433
+ add_generation_prompt=add_generation_prompt,
1434
+ continue_final_message=not add_generation_prompt,
1435
+ )
1436
+ except jinja2.exceptions.TemplateError:
1437
+ eval_logger.warning(
1438
+ "Failed to apply chat template. removing the system role in chat history."
1439
+ )
1440
+ chat_history = [msg for msg in chat_history if msg["role"] != "system"]
1441
+ chat_templated = self.tokenizer.apply_chat_template(
1442
+ chat_history,
1443
+ tokenize=False,
1444
+ add_generation_prompt=add_generation_prompt,
1445
+ continue_final_message=not add_generation_prompt,
1446
+ )
1447
+
1448
+ return chat_templated
1449
+
1450
+ def get_model_info(self) -> dict:
1451
+ """
1452
+ Method to get Hugging Face model information for experiment reproducibility.
1453
+ """
1454
+
1455
+ def get_model_num_params(model) -> int:
1456
+ if hasattr(model, "num_parameters"):
1457
+ return model.num_parameters()
1458
+ if hasattr(model, "parameters"):
1459
+ return sum(p.numel() for p in model.parameters())
1460
+ else:
1461
+ return -1
1462
+
1463
+ def get_model_dtype(model) -> str:
1464
+ if hasattr(model, "dtype"):
1465
+ return model.dtype
1466
+ else:
1467
+ return ""
1468
+
1469
+ def get_model_sha(pretrained: str, revision: str) -> str:
1470
+ try:
1471
+ model_info = HfApi().model_info(repo_id=pretrained, revision=revision)
1472
+ return model_info.sha
1473
+ except Exception as e:
1474
+ eval_logger.debug(
1475
+ f"Failed to get model SHA for {pretrained} at revision {revision}. Error: {e}"
1476
+ )
1477
+ return ""
1478
+
1479
+ model_info = {
1480
+ "model_num_parameters": get_model_num_params(self._model),
1481
+ "model_dtype": get_model_dtype(self._model),
1482
+ "model_revision": self.revision,
1483
+ "model_sha": get_model_sha(self.pretrained, self.revision),
1484
+ }
1485
+ if self.peft:
1486
+ model_info["peft_sha"] = get_model_sha(self.peft, self.revision)
1487
+ if self.delta:
1488
+ model_info["delta_sha"] = get_model_sha(self.delta, self.revision)
1489
+ return model_info
Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/utils.py ADDED
@@ -0,0 +1,854 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import fnmatch
3
+ import gc
4
+ import itertools
5
+ import logging
6
+ import time
7
+ from functools import wraps
8
+ from typing import (
9
+ TYPE_CHECKING,
10
+ Any,
11
+ Callable,
12
+ Dict,
13
+ Iterable,
14
+ Iterator,
15
+ List,
16
+ Literal,
17
+ Optional,
18
+ Tuple,
19
+ Type,
20
+ Union,
21
+ )
22
+
23
+ import torch
24
+ import transformers
25
+
26
+
27
+ eval_logger = logging.getLogger(__name__)
28
+
29
+
30
+ if TYPE_CHECKING:
31
+ from PIL import Image
32
+ from transformers import PreTrainedTokenizerBase
33
+ from transformers.configuration_utils import PretrainedConfig
34
+
35
+
36
def chunks(iter, n: int = 0, fn=None):
    """
    Yield successive batches from an iterable. Useful for batching.

    Batches hold ``n`` items each (the last batch may be shorter). When *fn*
    is supplied it overrides ``n``: it is called as ``fn(index, iter)`` and
    must return the desired size of the batch currently being filled.

    Parameters:
    - iter: the input iterable to be divided into chunks.
    - n: size of each chunk. Default is 0.
    - fn: optional callable ``(current_index, iterable) -> chunk_size``.

    Example:
        >>> list(chunks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3))
        [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]
    """
    batch = []
    for idx, item in enumerate(iter):
        batch.append(item)
        target = fn(idx, iter) if fn else n
        if len(batch) == target:
            yield batch
            batch = []
    # flush the final, possibly short, batch
    if batch:
        yield batch
72
+
73
+
74
class MultiChoice:
    """Iterable container of valid names supporting comma-separated,
    shell-wildcard membership checks (used for task selection)."""

    def __init__(self, choices) -> None:
        self.choices = choices

    # Simple wildcard support (linux filename patterns)
    def __contains__(self, values) -> bool:
        """Return True when every comma-separated pattern in *values* matches
        at least one choice; otherwise log the available choices and raise."""
        for pattern in values.split(","):
            if not fnmatch.filter(self.choices, pattern):
                eval_logger.info("Available tasks to choose:")
                for choice in self.choices:
                    eval_logger.info(f"  - {choice}")
                raise ValueError("'{}' is not in task list".format(pattern))
        return True

    def __iter__(self) -> Iterator:
        yield from self.choices
91
+
92
+
93
class Grouper:
    """
    Partition an array by a key function while remembering original positions.

    Builds a dict mapping ``fn(ob)`` to the list of all elements of ``arr``
    with that key; ``get_original`` later stitches per-group results back
    into the original input order.
    """

    def __init__(self, arr, fn) -> None:
        self.size = len(arr)
        # Pair every element with its original index, then bucket by key.
        buckets = collections.defaultdict(list)
        for idx, ob in enumerate(arr):
            buckets[fn(ob)].append((idx, ob))
        # self.arr has format Dict[key, List[Tuple[int, element]]]
        self.arr = buckets
        self._grouped = None

    def get_grouped(self):
        """Return {key: [elements]} with the bookkeeping indices stripped."""
        if self._grouped:
            return self._grouped
        self._grouped = {
            key: [element for _, element in pairs] for key, pairs in self.arr.items()
        }
        return self._grouped

    def get_original(self, grouped_dict):
        """Given per-group results (in the same per-group order produced by
        ``get_grouped``), return one flat list in the original input order."""
        res = [None] * self.size
        cov = [False] * self.size

        assert grouped_dict.keys() == self.arr.keys()

        for key, values in grouped_dict.items():
            for (idx, _), value in zip(self.arr[key], values):
                res[idx] = value
                cov[idx] = True

        # every original slot must have been filled
        assert all(cov)

        return res
149
+
150
+
151
def pad_and_concat(
    max_length: int,
    tensors: List[torch.Tensor],
    padding_side: Literal["right", "left"] = "right",
):
    """
    Pad each 1-D (or ``[1, seq]``) tensor up to *max_length* and stack them
    into a single ``[batch, max_length]`` tensor. Used for batching inputs
    and continuations in seq2seq models.

    Args:
        max_length: target length; tensors longer than this are not truncated.
        tensors: list of ``[seq]`` or ``[1, seq]`` token-id tensors.
        padding_side: "right" (default) appends zero padding, "left" prepends it.

    Returns:
        ``torch.Tensor`` of shape ``[len(tensors), max_length]``.

    Note:
        Fixes two issues of the earlier revision: the input list is no longer
        mutated in place, and padding zeros use each tensor's own dtype
        (previously hard-coded ``torch.long``), generalizing to float tensors
        while remaining identical for integer token ids.
    """
    assert padding_side in ("left", "right"), (
        f"Unrecognized padding type: '{padding_side}' not 'left' or 'right'"
    )

    padded = []
    for tensor in tensors:
        if tensor.dim() == 2:
            # accept [1, seq] inputs as well as [seq]
            tensor = tensor.squeeze(0)
        gap = max_length - tensor.shape[0]
        if gap > 0:
            pad = torch.zeros(gap, dtype=tensor.dtype, device=tensor.device)
            pieces = [tensor, pad] if padding_side == "right" else [pad, tensor]
            tensor = torch.cat(pieces, dim=0)
        padded.append(tensor.unsqueeze(0))

    return torch.cat(padded, dim=0)
200
+
201
+
202
def clear_torch_cache() -> None:
    """Run a Python GC pass and release cached CUDA memory (when CUDA exists).

    Guarding the CUDA call makes the helper safe on CPU-only installs, where
    some torch builds raise from ``torch.cuda.empty_cache()``.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
205
+
206
+
207
def get_dtype(dtype: Union[str, torch.dtype]) -> torch.dtype:
    """Converts `dtype` from `str` to torch.dtype when possible. Does not use an instantiated HF AutoConfig"""
    # "auto" and actual torch.dtype values pass through untouched.
    if not isinstance(dtype, str) or dtype == "auto":
        return dtype
    # `str` arg -> torch dtype: `float16` -> `torch.float16`
    return getattr(torch, dtype)
215
+
216
+
217
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence.

    Generation halts once every sequence in the batch has produced the stop
    string *somewhere* in its generated suffix (checked on decoded text, so
    alternative tokenizations of the same string still trigger a stop).
    """

    def __init__(
        self,
        sequence: str,
        tokenizer: transformers.PreTrainedTokenizer,
        initial_decoder_input_length: int,
        batch_size: int,
    ) -> None:
        self.initial_decoder_input_length = initial_decoder_input_length
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
        # Look back two extra tokens beyond the encoded stop sequence: a model
        # may emit the same string under a different tokenization (e.g. two
        # '\n' tokens instead of one '\n\n') and we still want to stop.
        # The small over-read cannot reach into the prompt, because the
        # lookback window in __call__ is sliced *after* dropping the initial
        # decoder input.
        self.sequence_id_len = len(self.sequence_ids) + 2
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Only decode the tail of the *generated* part of each sequence.
        generated = input_ids[:, self.initial_decoder_input_length :]
        lookback = generated[:, -self.sequence_id_len :]
        decoded = self.tokenizer.batch_decode(lookback)
        # Once a row is done it stays done; re-check the rest.
        self.done_tracker = [
            done or self.sequence in text
            for done, text in zip(self.done_tracker, decoded)
        ]
        return all(self.done_tracker)
255
+
256
+
257
def stop_sequences_criteria(
    tokenizer: transformers.PreTrainedTokenizer,
    stop_sequences: List[str],
    initial_decoder_input_length: int,
    batch_size: int,
) -> transformers.StoppingCriteriaList:
    """Build a StoppingCriteriaList containing one MultiTokenEOSCriteria per
    stop string, all sharing the same tokenizer and batch geometry."""
    criteria = [
        MultiTokenEOSCriteria(
            sequence, tokenizer, initial_decoder_input_length, batch_size
        )
        for sequence in stop_sequences
    ]
    return transformers.StoppingCriteriaList(criteria)
273
+
274
+
275
def undistribute(iterable):
    """
    Undoes https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.distribute .

    Re-interleaves results that have been split using more_itertools.distribute:
        >>> group_1, group_2 = distribute(2, [1, 2, 3, 4, 5, 6])
        >>> list(group_1)
        [1, 3, 5]
        >>> list(group_2)
        [2, 4, 6]
        >>> undistribute([group_1, group_2])
        [1, 2, 3, 4, 5, 6]

    Handles non-uniform component lengths:

        >>> children = distribute(3, [1, 2, 3, 4, 5, 6, 7])
        >>> [list(c) for c in children]
        [[1, 4, 7], [2, 5], [3, 6]]
        >>> undistribute(children)
        [1, 2, 3, 4, 5, 6, 7]

    Also handles when some iterables are empty:

        >>> children = distribute(5, [1, 2, 3])
        >>> [list(c) for c in children]
        [[1], [2], [3], [], []]
        >>> undistribute(children)
        [1, 2, 3]

    """
    # Use a unique sentinel (rather than None) as the zip fill value so that
    # genuine None values inside the data survive the round trip; the previous
    # revision filtered on `is not None` and silently dropped them.
    _fill = object()
    return [
        x
        for x in itertools.chain.from_iterable(
            itertools.zip_longest(*[list(x) for x in iterable], fillvalue=_fill)
        )
        if x is not _fill
    ]
313
+
314
+
315
def retry_on_specific_exceptions(
    on_exceptions: List[Type[Exception]],
    max_retries: Optional[int] = None,
    backoff_time: float = 3.0,
    backoff_multiplier: float = 1.5,
    on_exception_callback: Optional[Callable[[Exception, float], Any]] = None,
):
    """Retry on an LLM Provider's rate limit error with exponential backoff.

    Args:
        on_exceptions: exception types that trigger a retry; anything else
            propagates immediately.
        max_retries: total number of attempts; None retries forever.
        backoff_time: initial sleep between attempts, in seconds.
        backoff_multiplier: factor applied to the sleep after each failure.
        on_exception_callback: optional hook, called as
            ``callback(exc, sleep_time)`` before sleeping.

    Unlike the previous revision, once the retry budget is exhausted the final
    exception is re-raised instead of the wrapper silently returning None.

    For example, to use for OpenAI, do the following:
    ```
    from openai import RateLimitError

    # Recommend specifying max_retries to avoid infinite loops!
    @retry_on_specific_exceptions([RateLimitError], max_retries=3)
    def completion(...):
        # Wrap OpenAI completion function here
        ...
    ```
    """

    def decorator(func: Callable):
        @wraps(func)
        def wrapper(*args, **kwargs):
            sleep_time = backoff_time
            attempt = 0
            while True:
                try:
                    return func(*args, **kwargs)
                except tuple(on_exceptions) as e:
                    attempt += 1
                    if max_retries is not None and attempt >= max_retries:
                        # retry budget exhausted: surface the error
                        raise
                    if on_exception_callback is not None:
                        on_exception_callback(e, sleep_time)
                    time.sleep(sleep_time)
                    sleep_time *= backoff_multiplier

        return wrapper

    return decorator
353
+
354
+
355
class Collator:
    """
    A class for reordering and batching elements of an array.

    This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data.

    Objects of this class have the group_by attribute which determines the method for grouping
    the data while batching it. Three options include "gen_kwargs", "contexts", or None:
        If group_by == "gen_kwargs" then requests will be grouped by gen_kwargs
        If group_by == "contexts" then requests will be grouped by context + cont[:-1]
        If None then requests will just be reordered by length descending.
    """

    def __init__(
        self,
        arr: List,
        sort_fn: Callable = lambda x: x,
        group_fn: Callable = lambda x: x[1],
        group_by: Union[Literal["gen_kwargs", "contexts"], None] = None,
    ) -> None:
        self._group_by = group_by
        # 0 indices are enumerated indices. Apply functions to original arr.
        self._sort_fn = lambda x: sort_fn(x[1])
        self._group_fn = lambda x: group_fn(x[1])
        # Original positions of yielded elements, recorded in yield order;
        # consumed by get_original() to undo the sorting/grouping.
        self._reorder_indices: List = []
        self._size = len(arr)
        self._arr_with_indices: Union[Dict, Tuple[Tuple[int, Any], ...]] = tuple(
            enumerate(arr)
        )  # [indices, (arr)]
        if self._group_by == "contexts":
            self._group_by_context()
        elif self._group_by == "gen_kwargs":
            self._group_by_index()

    def _group_by_index(self) -> None:
        """Group the elements of a list based on their indices."""
        self._arr_with_indices = self.group(
            self._arr_with_indices, fn=self._group_fn, group_by="gen_kwargs"
        )

    def _group_by_context(self) -> None:
        """Group the array with indices by context."""
        self._arr_with_indices = self.group(
            self._arr_with_indices, fn=self._group_fn, group_by="contexts"
        )

    def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator:
        """
        Generates and yields batches from the reordered array. The method of grouping and batching
        depends on the parameter `group_by`.
        If `group_by` is set to "gen_kwargs", it will batch the
        re-ordered values with same gen_kwargs for each batch.
        If `group_by` is "contexts", it caches the requests by context before batching.
        If `group_by` is neither "gen_kwargs" nor "contexts", it yields the reordered array

        Parameters:
        - n (int): The size of each batch. Defaults to 1.
        - batch_fn ([Callable[[int, Iterable], int]] | None): A function to determine the size of
          each batch. Optional, defaults to None.

        Returns:
        Iterator: An iterator over batches of reordered elements grouped as per the `group_by`
                  attribute.

        Yields:
        List of batched elements according to the `group_by` attribute.
        """
        if self._group_by == "gen_kwargs":
            for (
                key,
                values,
            ) in self._arr_with_indices.items():  # type: ignore
                values = self._reorder(values)
                batch = self.get_chunks(values, n=n, fn=batch_fn)
                yield from batch
        elif self._group_by == "contexts":
            # Get one sample from each key.
            # Select longest continuation per group to ensure sufficient context logits
            values = self._reorder(
                [
                    max(value, key=lambda x: len(x[1][-1]))
                    for value in self._arr_with_indices.values()
                ]
            )
            batch = self.get_chunks(values, n=n, fn=batch_fn)
            yield from batch
        else:
            values = self._reorder(self._arr_with_indices)  # type: ignore
            batch = self.get_chunks(values, n=n, fn=batch_fn)
            yield from batch

    def get_cache(
        self,
        req_str: Tuple[str, str] = None,
        cxt_toks: List[int] = None,
        cont_toks: List[int] = None,
        logits: torch.Tensor = None,
    ) -> Iterator[Tuple[Tuple[str, str], List[int], torch.Tensor]]:
        """
        Retrieves cached single-token continuations and their associated arguments, updating indices as necessary.

        The behavior of this function varies depending on how the `group_by` attribute is set:

        - When `group_by` is "contexts":
            The function identifies single-token continuations by checking for keys that equate to
            [context+continuation][-1] and logs the indices for re-ordering.
            In this mode, this function can work in two scenarios:

            1. Cache Hit - Single Match:
                If a single matching context-continuation pair is found in the cache,
                the function yields the original arguments.

            2. Cache Hit - Multiple Matches:
                If multiple matching context-continuation pairs are found in the cache,
                the function expands the logits batch dimension to match the number of cache hits.
                It updates the original requests and continuation tokens.

        - When `group_by` is not set to "contexts":
            This method yields the original arguments, logits and continuation tokens,
            without checking for one-token continuations.

        Parameters:
        - req_str (tuple[str, str]): Original strings used for CachingLM.
        - cxt_toks (list[int]): Full context tokens used for lookup.
        - cont_toks (list[int]): Continuation tokens for which logits were generated.
        - logits (torch.Tensor [1, seq_length, vocab_size]): Logits generated by the model given context and continuation keys.

        Yields:
        - Iterator:
            - req_str (tuple[str, str]): strings used for CachingLM.
            - cont_toks (list[int]) : continuation tokens.
            - logits (torch.Tensor [1, seq_length, vocab_size]): The original logits (repeated cache hit times)
        """
        if self._group_by == "contexts":
            # pop() ensures each cache group is consumed exactly once.
            cache_hit: List[
                Tuple[int, Tuple[Tuple[str, str], List[int], List[int]]]
            ] = self._arr_with_indices.pop(tuple(cxt_toks + cont_toks[:-1]))
            if (cache_size := len(cache_hit)) == 1:
                self._reorder_indices.extend(x[0] for x in cache_hit)
                yield req_str, cont_toks, logits
            else:
                # If we have matching requests then expand the batch dimension (no-op) and
                # yield each along with its corresponding args.
                multilogits = logits.expand(cache_size, -1, -1).chunk(cache_size)
                indices, req_str, cont_toks = zip(
                    *[(x[0], x[1][0], x[-1][-1]) for x in cache_hit]
                )
                self._reorder_indices.extend(indices)
                for c_key, cont_tok, logit in zip(req_str, cont_toks, multilogits):
                    yield c_key, cont_tok, logit
        else:
            yield req_str, cont_toks, logits

    def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> Iterator:
        """
        Reorders the elements in the array based on the sorting function.

        Parameters:
        - arr (list | tuple[tuple[int, Any], ...]]): The array or iterable to be reordered.

        Yields:
        Iterator
        """
        arr = sorted(arr, key=self._sort_fn)
        if not self._group_by == "contexts":
            # If grouped by contexts then indices will be set in get_cache()
            self._reorder_indices.extend([x[0] for x in arr])
        yield from [x[1] for x in arr]

    def get_original(self, newarr: List) -> List:
        """
        Restores the original order of elements from the reordered list.

        Parameters:
        - newarr (list): The reordered array.

        Returns:
        list: The array with elements restored to their original order.
        """
        res = [None] * self._size
        cov = [False] * self._size

        # _reorder_indices[i] is the original position of newarr[i].
        for ind, v in zip(self._reorder_indices, newarr):
            res[ind] = v
            cov[ind] = True

        # every original slot must have been produced exactly once
        assert all(cov)

        return res

    def __len__(self):
        return self._size

    @staticmethod
    def group(
        arr: Iterable,
        fn: Callable,
        group_by: Literal["gen_kwargs", "contexts"] = "gen_kwargs",
    ) -> dict:
        """
        Groups elements of an iterable based on a provided function.


        The `group_by` parameter determines the method of grouping.
        If `group_by` is "contexts", the elements are grouped by [context + cont][:-1].
        If `group_by` is "gen_kwargs", the elements are grouped based on the gen_kwargs dict.

        Parameters:
        - arr (Iterable): The iterable to be grouped.
        - fn (Callable): The function to determine the grouping.
        - group_by ("gen_kwargs" | "contexts"): method of grouping, see above.

        Returns:
        dict: mapping from group key to the list of grouped elements.
        """
        res = collections.defaultdict(list)
        for ob in arr:
            # where ob == [context + cont]
            if group_by == "contexts":
                res[tuple(fn(ob))].append(ob)
            else:
                try:
                    # gen_kwargs dicts are unhashable; convert to a sorted
                    # tuple of (key, value) pairs, tupling iterable values.
                    hashable_dict = tuple(
                        (
                            key,
                            tuple(value)
                            if isinstance(value, collections.abc.Iterable)
                            else value,
                        )
                        for key, value in sorted(fn(ob).items())
                    )
                    res[hashable_dict].append(ob)
                except (TypeError, AttributeError):
                    # fall back for non-dict group keys
                    res[tuple(fn(ob))].append(ob)
        return res

    @staticmethod
    def get_chunks(_iter, n: int = 0, fn=None):
        """
        Divides an iterable into chunks of specified size or based on a given function.
        Useful for batching

        Parameters:
        - iter: The input iterable to be divided into chunks.
        - n: An integer representing the size of each chunk. Default is 0.
        - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.

        Returns:
        An iterator that yields chunks of the input iterable.

        Example usage:
        ```
        data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        for chunk in chunks(data, 3):
            print(chunk)
        ```
        Output:
        ```
        [1, 2, 3]
        [4, 5, 6]
        [7, 8, 9]
        [10]
        ```
        """
        arr = []
        _iter = tuple(_iter)
        for i, x in enumerate(_iter):
            arr.append(x)
            if len(arr) == (fn(i, _iter) if fn else n):
                yield arr
                arr = []

        if arr:
            yield arr
629
+
630
+
631
def configure_pad_token(
    tokenizer: "PreTrainedTokenizerBase",
    model_config: Optional["PretrainedConfig"] = None,
) -> "PreTrainedTokenizerBase":
    """
    Ensure a (Hugging Face) tokenizer has a usable padding token, assigning one
    when missing. Preference order: existing pad token, then unk, then eos,
    then model-specific special cases, and finally a newly added "<|pad|>".

    Args:
        tokenizer: The tokenizer for which the padding token is to be handled.
        model_config: The configuration of the model. Default is None.

    Returns:
        The tokenizer after the padding token has been handled.

    Raises:
        AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0.
    """
    if tokenizer.pad_token:
        return tokenizer

    if tokenizer.unk_token:
        tokenizer.pad_token_id = tokenizer.unk_token_id
        return tokenizer

    if tokenizer.eos_token:
        tokenizer.pad_token_id = tokenizer.eos_token_id
        return tokenizer

    # handle special cases
    if model_config and getattr(model_config, "model_type", None) == "qwen":
        # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
        tokenizer.pad_token = "<|endoftext|>"
    elif type(tokenizer).__name__ in ("RWKVWorldTokenizer", "Rwkv5Tokenizer"):
        # The RWKV world tokenizer does not allow adding special tokens /
        # setting the pad token (which is fixed at 0). The name check is
        # needed because rwkv4 models exist with a neox tokenizer.
        # Note the world tokenizer class name might change after the final
        # huggingface merge: https://github.com/huggingface/transformers/pull/26963
        assert tokenizer.pad_token_id == 0
    else:
        tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

    return tokenizer
674
+
675
+
676
def replace_placeholders(
    string: str, default_placeholder: str, image_token: str, max_images: int
):
    """
    Utility for local multimodal models: replace the first `max_images`
    occurrences of `default_placeholder` in `string` with `image_token`.

    Occurrences beyond `max_images` are kept as-is when the placeholder differs
    from the image token, and dropped entirely when they are the same (to avoid
    feeding the model more image tokens than images).

    :param string: The original string containing placeholders.
    :param default_placeholder: The placeholder text to be replaced.
    :param image_token: The token to replace the placeholder with.
    :param max_images: The maximum number of replacements to make.
    :return: The string with placeholders replaced.
    """
    segments = string.split(default_placeholder)
    pieces = [segments[0]]
    replaced = 0

    for segment in segments[1:]:
        if replaced < max_images:
            pieces.append(image_token)
            replaced += 1
        elif default_placeholder != image_token:
            # Past the limit: keep the original placeholder text.
            pieces.append(default_placeholder)
        pieces.append(segment)

    return "".join(pieces)
708
+
709
+
710
def flatten_image_list(images: List[List]):
    """
    Concatenate a list of lists of images into one flat list, preserving order.

    Used for some multimodal models like Llava-1.5 whose image processor
    expects this flattened-list format.

    :param images: A list of lists of PIL images.
    :return: a single list of PIL images, sub-lists concatenated in order.
    """
    flat = []
    for sub_list in images:
        flat.extend(sub_list)
    return flat
719
+
720
+
721
def handle_stop_sequences(
    until: Union[str, List[str], None], eos: Optional[str]
) -> List[str]:
    """Normalize `until` into a list of stop sequences and append the EOS token
    if it is not already present. A list argument is mutated in place."""
    if isinstance(until, str):
        stop_list = [until]
    elif isinstance(until, list):
        stop_list = until
    elif until is None:
        stop_list = []
    else:
        raise ValueError(
            f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
        )

    if eos is not None and eos not in stop_list:
        stop_list.append(eos)
    return stop_list
737
+
738
+
739
def resize_image(
    image: "Image.Image",
    width: Optional[int] = None,
    height: Optional[int] = None,
    max_dimension: Optional[int] = None,
    keep_aspect_ratio: bool = True,
    resample_filter: Union[int, str] = "Image.BICUBIC",
    min_width: int = 1,
    min_height: int = 1,
) -> "Image.Image":
    """
    Resizes a PIL Image object with flexible options.

    Args:
        image: The PIL Image object to resize.
        width: Target width in pixels.
        height: Target height in pixels.
        max_dimension: Maximum size for the longer dimension of the image.
        keep_aspect_ratio: If True (default) and both width and height are provided,
            the image is resized to fit within these dimensions while maintaining
            its aspect ratio. If False, the image is stretched to the exact
            width and height.
        resample_filter: The resampling filter, either a PIL integer code or a
            filter name such as "Image.BICUBIC" / "bicubic". Defaults to bicubic.
        min_width: Minimum width for the resized image. Defaults to 1.
        min_height: Minimum height for the resized image. Defaults to 1.

    Returns:
        The resized PIL Image object. If no resize parameters are provided
        or if the image already meets the criteria, the original image is returned.

    Order of precedence for resizing:
        1. width AND height: fit within bounds (or stretch if keep_aspect_ratio=False).
        2. only width: height scaled proportionally.
        3. only height: width scaled proportionally.
        4. max_dimension: longest side capped, other side scaled proportionally.
        5. none of the above: the original image is returned.
    """
    original_width, original_height = image.size

    # If no arguments are provided, return the original image
    if width is None and height is None and max_dimension is None:
        return image

    # Resolve a string filter name to the integer code PIL expects.
    # FIX: the previous default passed the literal string "Image.BICUBIC"
    # straight to Image.resize(), which raises a TypeError.
    if isinstance(resample_filter, str):
        # Values mirror PIL.Image.Resampling (NEAREST..HAMMING).
        resample_codes = {
            "NEAREST": 0,
            "LANCZOS": 1,
            "BILINEAR": 2,
            "BICUBIC": 3,
            "BOX": 4,
            "HAMMING": 5,
        }
        filter_name = resample_filter.rsplit(".", 1)[-1].upper()
        resample_filter = resample_codes.get(filter_name, 3)  # fall back to BICUBIC

    new_width = original_width
    new_height = original_height

    if width is not None and height is not None:
        # No resize needed if image is already smaller than target dimensions
        if original_width <= width and original_height <= height:
            return image

        if keep_aspect_ratio:
            # Calculate the ratio to fit within the target dimensions
            ratio = min(width / original_width, height / original_height)
            new_width = int(original_width * ratio)
            new_height = int(original_height * ratio)
        else:
            # Stretch to exact dimensions
            new_width = width
            new_height = height
    elif width is not None:
        # No resize needed if width is already smaller
        if original_width <= width:
            return image
        # Calculate height proportionally
        new_width = width
        new_height = int((original_height / original_width) * new_width)
    elif height is not None:
        # No resize needed if height is already smaller
        if original_height <= height:
            return image
        # Calculate width proportionally
        new_height = height
        new_width = int((original_width / original_height) * new_height)
    elif max_dimension is not None:
        # No resize needed if both dimensions are smaller than max_dimension
        if max(original_height, original_width) <= max_dimension:
            return image

        if original_width > original_height:
            # Width is the longer side
            new_width = max_dimension
            new_height = int((original_height / original_width) * new_width)
        else:
            # Height is the longer side or sides are equal
            new_height = max_dimension
            new_width = int((original_width / original_height) * new_height)

    # Ensure dimensions are at least minimum values (proportional scaling can
    # round a very thin side down to 0).
    new_width = max(min_width, new_width)
    new_height = max(min_height, new_height)

    # Perform the resize operation with the calculated dimensions
    return image.resize((new_width, new_height), resample_filter)
837
+
838
+
839
def truncate_tokens(
    tokens: List[int],
    max_length: int,
    tokenizer: "PreTrainedTokenizerBase",
    strategy: str = "left",
):
    """Truncate `tokens` to at most `max_length` items.

    Args:
        tokens: the token id sequence to truncate.
        max_length: maximum number of tokens to keep; non-positive yields [].
        tokenizer: kept for interface compatibility (unused here).
        strategy: "left" (keep the tail), "right" (keep the head), or
            "middle" (keep head and tail, dropping the middle).

    Returns:
        A list of at most `max_length` tokens.

    Raises:
        ValueError: for an unknown strategy (previously returned None silently).
    """
    if max_length <= 0:
        # FIX: tokens[-0:] returns the *whole* list, so handle 0 explicitly.
        return []
    if len(tokens) <= max_length:
        # FIX: without this, "middle" would duplicate tokens when the sequence
        # is already short enough (e.g. [1,2] with max_length=3 -> [1,1,2]).
        return list(tokens)
    if strategy == "left":
        return tokens[-max_length:]
    if strategy == "right":
        return tokens[:max_length]
    if strategy == "middle":
        # Keep the first half and last half, dropping the middle of the sequence.
        left_length = max_length // 2
        right_length = max_length - left_length
        return tokens[:left_length] + tokens[-right_length:]
    raise ValueError(f"Unknown truncation strategy: {strategy!r}")
Prism/LLaDA/LLaDA_Baseline/dllm_eval/models/verifier.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import logging
3
+ import ast
4
+ import re
5
+ import numpy as np
6
+ import textwrap
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class CodeVerifier:
    """Scores a generated answer either by the generator's own token confidence
    or by prompting `model` as a yes/no judge ("self-verification", SVF)."""

    def __init__(self, model, tokenizer, device="cuda"):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

        # Collect the final token id of several spellings of "Yes"/"No"; these
        # are the candidate answer tokens whose logits are compared in svf_score.
        self.yes_ids, self.no_ids = [], []
        for text in ["Yes", " Yes", "YES"]:
            ids = self.tokenizer.encode(text, add_special_tokens=False)
            if ids:
                self.yes_ids.append(ids[-1])
        for text in ["No", " No", "NO"]:
            ids = self.tokenizer.encode(text, add_special_tokens=False)
            if ids:
                self.no_ids.append(ids[-1])

        # FIX: deduplicate deterministically; list(set(...)) has unstable order
        # across runs (hash randomization).
        self.yes_ids = list(dict.fromkeys(self.yes_ids))
        self.no_ids = list(dict.fromkeys(self.no_ids))

    def _extract_python_code(self, text):
        """Return the contents of the first fenced code block in `text`
        (preferring a ```python fence), or the stripped text itself."""
        text = text.strip()
        match = re.search(r"```python\s*(.*?)```", text, re.DOTALL)
        if match:
            return match.group(1)
        match_generic = re.search(r"```\s*(.*?)```", text, re.DOTALL)
        if match_generic:
            return match_generic.group(1)
        return text

    def check_syntax(self, code_str):
        """True iff the extracted code is non-trivial and parses as Python."""
        clean_code = self._extract_python_code(code_str)
        if len(clean_code.strip()) < 5:
            # Reject near-empty snippets that would "parse" vacuously.
            return False
        try:
            ast.parse(clean_code)
            return True
        except (SyntaxError, ValueError):
            # FIX: was a bare `except:` that also swallowed KeyboardInterrupt /
            # SystemExit. ast.parse raises SyntaxError for malformed code and
            # ValueError for e.g. source containing NUL bytes.
            return False

    def compute_confidence(self, logits):
        """Geometric mean of the per-position top-1 probabilities of `logits`
        (softmax over the last dim); 0.0 when no logits are available."""
        if logits is None:
            return 0.0
        probs = torch.softmax(logits, dim=-1)
        max_probs, _ = torch.max(probs, dim=-1)
        log_probs = torch.log(max_probs + 1e-10)
        return torch.exp(torch.mean(log_probs)).item()

    def svf_score(self, prompt, code_str, task_type="code"):
        """Self-verification: ask the model whether the answer is correct and
        return P("Yes") over the {"Yes", "No"} candidates at the answer slot.

        task_type selects the judging template: "code", "math", "reasoning",
        or a generic fallback.
        """
        # Truncate very long answers; for reasoning keep both head and tail.
        max_len = 2000
        if len(code_str) > max_len:
            if task_type == "reasoning":
                truncated_code = code_str[:500] + "\n...[truncated]...\n" + code_str[-(max_len - 500):]
            else:
                truncated_code = code_str[-max_len:]
        else:
            truncated_code = code_str

        if task_type == "code":
            prompt_template = f"""
You are an expert programming contest judge. Your task is to evaluate a generated solution for a given problem based on correctness, efficiency, and adherence to constraints.

[Problem Statement]
{prompt}
[/Problem Statement]

[Proposed Python Solution]
```python
{truncated_code}
```
[/Proposed Python Solution]

**Analysis Steps:**
1. Correctness: Does the core algorithm correctly solve the problem?
2. Efficiency: Is the time complexity acceptable for the given constraints?
3. Edge Cases & Constraints: Does the code handle all rules and edge cases?

**Conclusion**: Based on your analysis, is the solution likely to be fully correct? Answer with a single word: Yes or No.
**Answer:** """

        elif task_type == "math":
            prompt_template = f"""
You are an expert mathematician and competition judge. Your task is to evaluate a proposed mathematical solution for a given problem based on its logical rigor and accuracy.

[Math Problem]
{prompt}
[/Math Problem]

[Proposed Mathematical Solution]
{truncated_code}
[/Proposed Mathematical Solution]

**Analysis Steps:**
1. Reasoning Validity: Are the logical steps and mathematical properties applied correctly?
2. Calculation Accuracy: Are the intermediate calculations or algebraic manipulations accurate?
3. Goal Alignment: Does the current reasoning path directly lead toward the final answer required by the problem?

**Conclusion**: Based on your analysis, is this solution path sound and likely to result in the correct final answer? Answer with a single word: Yes or No.
**Answer:** """

        elif task_type == "reasoning":
            prompt_template = f"""
You are an expert reading comprehension and faithfulness judge. Your task is to evaluate a generated answer based on the provided context and question.

[Context and Question]
{prompt}
[/Context and Question]

[Proposed Answer]
{truncated_code}
[/Proposed Answer]

**Analysis Steps :**
1. Faithfulness: Is the answer an exact, literal span from the context?
2. Relevance: Does the answer directly address the specific question asked without hallucinating external information?
3. Accuracy: Does the provided context strictly support this answer?

**Conclusion**: Based on your analysis, is the answer fully faithful to the context and correct? Answer with a single word: Yes or No.
**Answer:** """

        else:
            prompt_template = f"Is the following answer correct?\nQuestion: {prompt}\nAnswer: {truncated_code}\nAnswer Yes or No.\nAnswer:"

        verify_text = textwrap.dedent(prompt_template).strip()
        input_ids = self.tokenizer(verify_text, return_tensors="pt").input_ids.to(self.device)

        # Respect the model's positional limit, whichever config attr exists.
        max_pos = getattr(self.model.config, "max_position_embeddings",
                          getattr(self.model.config, "n_positions",
                                  getattr(self.model.config, "max_sequence_length", 20480)))

        if input_ids.shape[1] > max_pos - 16:
            logger.warning("Verifier input is too long, truncating from the left.")
            input_ids = input_ids[:, -(max_pos - 16):]

        with torch.no_grad():
            outputs = self.model(input_ids)
            # Logits at the last position = the judge's next-token distribution.
            logits = outputs.logits[0, -1, :]

        # Best logit among the Yes / No candidate ids, guarding against ids
        # that fall outside this model's vocabulary.
        yes_score = max((logits[i].item() for i in self.yes_ids if i < logits.shape[-1]), default=-float('inf'))
        no_score = max((logits[i].item() for i in self.no_ids if i < logits.shape[-1]), default=-float('inf'))

        if yes_score == -float('inf') and no_score == -float('inf'):
            # Neither candidate token exists in the vocab: uninformative verdict.
            return 0.5

        probs = torch.softmax(torch.tensor([yes_score, no_score]), dim=0)
        return probs[0].item()

    def get_reward(self, prompt, code_str, mode="confidence", problem_data=None, current_logits=None, task_type="code"):
        """Dispatch: "svf" -> model-as-judge score; anything else -> raw
        confidence computed from `current_logits`."""
        if mode == "svf":
            return self.svf_score(prompt, code_str, task_type=task_type)
        return self.compute_confidence(current_logits)
Prism/LLaDA/LLaDA_Baseline/dllm_eval/prompts/__init__.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import logging
3
+ import os
4
+ from typing import Dict
5
+
6
+ from dllm_eval import utils
7
+
8
+
9
+ eval_logger = logging.getLogger(__name__)
10
+
11
# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
# prompt category name, and prompt name.
# This lets get_prompt() / load_prompt_list() resolve a "category:name"
# identifier to a template string. Values use {{field}} placeholders that are
# rendered against a document via utils.apply_template.
PROMPT_REGISTRY: Dict[str, Dict[str, str]] = {
    "qa-basic": {
        "question-newline-answer": "Question: {{question}}\nAnswer:",
        "q-newline-a": "Q: {{question}}\nA:",
    },
}
21
+
22
+
23
def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None):
    """Resolve a prompt template from a `category:name` identifier.

    Supported categories:
      - "promptsource": look the prompt up in the promptsource template library
        for (dataset_name, subset_name).
      - a path containing ".yaml": load the named prompt from the file's
        "prompts" mapping and wrap it in PromptString.
      - anything else: a category in the local PROMPT_REGISTRY.

    Raises:
        ValueError: if the dataset, category, or prompt cannot be found.
        ModuleNotFoundError: if promptsource is requested but not installed.
    """
    # unpack prompt name
    category_name, prompt_name = prompt_id.split(":")
    if subset_name is None:
        dataset_full_name = dataset_name
    else:
        dataset_full_name = f"{dataset_name}-{subset_name}"
    eval_logger.info(f"Loading prompt from {category_name} for {dataset_full_name}")
    if category_name == "promptsource":
        try:
            from promptsource.templates import DatasetTemplates
        except ModuleNotFoundError as exception:
            # FIX: the original passed two separate strings to the exception
            # constructor (producing a confusing tuple-valued message) and did
            # not chain the original exception.
            raise type(exception)(
                "Tried to load a Promptsource template, but promptsource is not installed. "
                "Please install promptsource via pip install lm-eval[promptsource] or pip install -e .[promptsource]"
            ) from exception
        try:
            if subset_name is None:
                prompts = DatasetTemplates(dataset_name=dataset_name)
            else:
                prompts = DatasetTemplates(
                    dataset_name=dataset_name, subset_name=subset_name
                )
        except Exception as exception:
            raise ValueError(f"{dataset_name} and {subset_name} not found") from exception
        if prompt_name in prompts.all_template_names:
            return prompts[prompt_name]
        raise ValueError(
            f"{prompt_name} not in prompt list {prompts.all_template_names}"
        )
    elif ".yaml" in category_name:
        # Category is a path to a YAML file with a top-level "prompts" mapping.
        import yaml

        with open(category_name, "rb") as file:
            prompt_yaml_file = yaml.full_load(file)

        prompt_string = prompt_yaml_file["prompts"][prompt_name]
        return PromptString(prompt_string)
    else:
        try:
            return PROMPT_REGISTRY[category_name][prompt_name]
        except KeyError as exception:
            # FIX: previously any Exception here was converted into a misleading
            # message about ":" separators; report the actual lookup failure.
            raise ValueError(
                f"Prompt `{prompt_id}` not found in PROMPT_REGISTRY"
            ) from exception
70
+
71
+
72
def load_prompt_list(
    use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs
):
    """Expand a (possibly wildcard) `category:name` prompt id into a list of
    concrete `category:name` ids.

    Args:
        use_prompt: "category:name" where name may contain wildcards matched
            via utils.pattern_match.
        dataset_name / subset_name: promptsource dataset coordinates.
        yaml_path: base directory used to resolve a relative ".yaml" category.
        **kwargs: ignored; accepted for call-site compatibility.

    Returns:
        A list of fully-qualified "category:name" prompt ids.

    Raises:
        ValueError: for an unsupported category (previously this fell through
            with `prompt_list` undefined, producing an opaque NameError).
    """
    category_name, prompt_name = use_prompt.split(":")

    if category_name == "promptsource":
        from promptsource.templates import DatasetTemplates

        if subset_name is None:
            prompts = DatasetTemplates(dataset_name=dataset_name)
        else:
            prompts = DatasetTemplates(
                dataset_name=dataset_name, subset_name=subset_name
            )

        prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)

    elif ".yaml" in category_name:
        import yaml

        if yaml_path is not None:
            category_name = os.path.realpath(os.path.join(yaml_path, category_name))

        with open(category_name, "rb") as file:
            prompt_yaml_file = yaml.full_load(file)

        prompt_list = utils.pattern_match(
            prompt_name, prompt_yaml_file["prompts"].keys()
        )

    else:
        raise ValueError(
            f"Unsupported prompt category `{category_name}` in `{use_prompt}`"
        )

    # TODO allow multiple prompt names, e.g. category:name1:name2
    return [":".join([category_name, prompt]) for prompt in prompt_list]
111
+
112
+
113
class PromptString:
    """Thin wrapper around a YAML-defined prompt mapping holding
    "doc_to_text" / "doc_to_target" template strings."""

    def __init__(self, prompt_string):
        # Mapping with the raw template strings.
        self.prompt_string = prompt_string

    def apply(self, doc):
        """Render the text and target templates against `doc`.

        Returns a two-element list: [rendered_text, rendered_target].
        """
        spec = self.prompt_string
        text_template = spec["doc_to_text"]
        target_template = spec["doc_to_target"]

        # TODO need a way to process doc_to_choice
        if "doc_to_choice" in spec:
            raise NotImplementedError("Not yet implemented to accept doc_to_choice")

        return [
            utils.apply_template(text_template, doc),
            utils.apply_template(target_template, doc),
        ]
Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/__init__.py ADDED
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import collections
2
+ import inspect
3
+ import logging
4
+ import os
5
+ from functools import partial
6
+ from typing import Dict, List, Mapping, Optional, Union
7
+
8
+ from dllm_eval import utils
9
+ from dllm_eval.api.group import ConfigurableGroup, GroupConfig
10
+ from dllm_eval.api.task import ConfigurableTask, Task
11
+ from dllm_eval.evaluator_utils import get_subtask_list
12
+
13
+
14
+ GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())
15
+
16
+ eval_logger = logging.getLogger(__name__)
17
+
18
+
19
+ class TaskManager:
20
+ """TaskManager indexes all tasks from the default `dllm_eval/tasks/`
21
+ and an optional directory if provided.
22
+
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ verbosity: Optional[str] = None,
28
+ include_path: Optional[Union[str, List]] = None,
29
+ include_defaults: bool = True,
30
+ metadata: Optional[dict] = None,
31
+ ) -> None:
32
+ if verbosity is not None:
33
+ utils.setup_logging(verbosity)
34
+ self.include_path = include_path
35
+ self.metadata = metadata
36
+ self._task_index = self.initialize_tasks(
37
+ include_path=include_path, include_defaults=include_defaults
38
+ )
39
+ self._all_tasks = sorted(list(self._task_index.keys()))
40
+
41
+ self._all_groups = sorted(
42
+ [x for x in self._all_tasks if self._task_index[x]["type"] == "group"]
43
+ )
44
+ self._all_subtasks = sorted(
45
+ [
46
+ x
47
+ for x in self._all_tasks
48
+ if self._task_index[x]["type"] in ["task", "python_task"]
49
+ ]
50
+ )
51
+ self._all_tags = sorted(
52
+ [x for x in self._all_tasks if self._task_index[x]["type"] == "tag"]
53
+ )
54
+
55
+ self.task_group_map = collections.defaultdict(list)
56
+
57
+ def initialize_tasks(
58
+ self,
59
+ include_path: Optional[Union[str, List]] = None,
60
+ include_defaults: bool = True,
61
+ ) -> dict[str, dict]:
62
+ """Creates a dictionary of tasks indexes.
63
+
64
+ :param include_path: Union[str, List] = None
65
+ An additional path to be searched for tasks recursively.
66
+ Can provide more than one such path as a list.
67
+ :param include_defaults: bool = True
68
+ If set to false, default tasks (those in dllm_eval/tasks/) are not indexed.
69
+ return
70
+ Dictionary of task names as key and task metadata
71
+ """
72
+ if include_defaults:
73
+ all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
74
+ else:
75
+ all_paths = []
76
+ if include_path is not None:
77
+ if isinstance(include_path, str):
78
+ include_path = [include_path]
79
+ all_paths.extend(include_path)
80
+
81
+ task_index = {}
82
+ for task_dir in all_paths:
83
+ tasks = self._get_task_and_group(task_dir)
84
+ task_index = {**tasks, **task_index}
85
+
86
+ return task_index
87
+
88
    @property
    def all_tasks(self):
        """Sorted names of every indexed entry (tasks, groups, and tags)."""
        return self._all_tasks

    @property
    def all_groups(self):
        """Sorted names of entries indexed with type "group"."""
        return self._all_groups

    @property
    def all_subtasks(self):
        """Sorted names of entries indexed with type "task" or "python_task"."""
        return self._all_subtasks

    @property
    def all_tags(self):
        """Sorted names of entries indexed with type "tag"."""
        return self._all_tags

    @property
    def task_index(self):
        """The raw task-name -> metadata index built by initialize_tasks()."""
        return self._task_index
107
+
108
+ def list_all_tasks(
109
+ self, list_groups=True, list_tags=True, list_subtasks=True
110
+ ) -> str:
111
+ from pytablewriter import MarkdownTableWriter
112
+
113
+ def sanitize_path(path):
114
+ # don't print full path if we are within the dllm_eval/tasks dir !
115
+ # if we aren't though, provide the full path.
116
+ if "dllm_eval/tasks/" in path:
117
+ return "dllm_eval/tasks/" + path.split("dllm_eval/tasks/")[-1]
118
+ else:
119
+ return path
120
+
121
+ group_table = MarkdownTableWriter()
122
+ group_table.headers = ["Group", "Config Location"]
123
+ gt_values = []
124
+ for g in self.all_groups:
125
+ path = self.task_index[g]["yaml_path"]
126
+ if path == -1:
127
+ path = "---"
128
+ else:
129
+ path = sanitize_path(path)
130
+ gt_values.append([g, path])
131
+ group_table.value_matrix = gt_values
132
+
133
+ tag_table = MarkdownTableWriter()
134
+ tag_table.headers = ["Tag"]
135
+ tag_table.value_matrix = [[t] for t in self.all_tags]
136
+
137
+ subtask_table = MarkdownTableWriter()
138
+ subtask_table.headers = ["Task", "Config Location", "Output Type"]
139
+ st_values = []
140
+ for t in self.all_subtasks:
141
+ path = self.task_index[t]["yaml_path"]
142
+
143
+ output_type = ""
144
+
145
+ # read the yaml file to determine the output type
146
+ if path != -1:
147
+ config = utils.load_yaml_config(path, mode="simple")
148
+ if "output_type" in config:
149
+ output_type = config["output_type"]
150
+ elif (
151
+ "include" in config
152
+ ): # if no output type, check if there is an include with an output type
153
+ include_path = path.split("/")[:-1] + config["include"]
154
+ include_config = utils.load_yaml_config(include_path, mode="simple")
155
+ if "output_type" in include_config:
156
+ output_type = include_config["output_type"]
157
+
158
+ if path == -1:
159
+ path = "---"
160
+ else:
161
+ path = sanitize_path(path)
162
+ st_values.append([t, path, output_type])
163
+ subtask_table.value_matrix = st_values
164
+
165
+ result = "\n"
166
+ if list_groups:
167
+ result += group_table.dumps() + "\n\n"
168
+ if list_tags:
169
+ result += tag_table.dumps() + "\n\n"
170
+ if list_subtasks:
171
+ result += subtask_table.dumps() + "\n\n"
172
+ return result
173
+
174
    def match_tasks(self, task_list: list[str]) -> list[str]:
        """Expand the (possibly wildcard) names in `task_list` against all
        registered task names via utils.pattern_match."""
        return utils.pattern_match(task_list, self.all_tasks)
176
+
177
+ def _name_is_registered(self, name: str) -> bool:
178
+ if name in self.all_tasks:
179
+ return True
180
+ return False
181
+
182
+ def _name_is_task(self, name: str) -> bool:
183
+ if self._name_is_registered(name) and (self.task_index[name]["type"] == "task"):
184
+ return True
185
+ return False
186
+
187
+ def _name_is_tag(self, name: str) -> bool:
188
+ if self._name_is_registered(name) and (self.task_index[name]["type"] == "tag"):
189
+ return True
190
+ return False
191
+
192
+ def _name_is_group(self, name: str) -> bool:
193
+ if self._name_is_registered(name) and (
194
+ self.task_index[name]["type"] == "group"
195
+ ):
196
+ return True
197
+ return False
198
+
199
+ def _name_is_python_task(self, name: str) -> bool:
200
+ if self._name_is_registered(name) and (
201
+ self.task_index[name]["type"] == "python_task"
202
+ ):
203
+ return True
204
+ return False
205
+
206
+ def _config_is_task(self, config: dict) -> bool:
207
+ if ("task" in config) and isinstance(config["task"], str):
208
+ return True
209
+ return False
210
+
211
+ def _config_is_group(self, config: dict) -> bool:
212
+ if ("task" in config) and isinstance(config["task"], list):
213
+ return True
214
+ return False
215
+
216
+ def _config_is_python_task(self, config: dict) -> bool:
217
+ if "class" in config:
218
+ return True
219
+ return False
220
+
221
+ def _get_yaml_path(self, name: str):
222
+ if name not in self.task_index:
223
+ raise ValueError
224
+ return self.task_index[name]["yaml_path"]
225
+
226
+ def _get_config(self, name):
227
+ if name not in self.task_index:
228
+ raise ValueError
229
+ yaml_path = self._get_yaml_path(name)
230
+ if yaml_path == -1:
231
+ return {}
232
+ else:
233
+ return utils.load_yaml_config(yaml_path, mode="full")
234
+
235
+ def _get_tasklist(self, name):
236
+ if self._name_is_task(name):
237
+ raise ValueError
238
+ return self.task_index[name]["task"]
239
+
240
+ def _process_alias(self, config, group=None):
241
+ # If the group is not the same as the original
242
+ # group which the group alias was intended for,
243
+ # Set the group_alias to None instead.
244
+ if ("group_alias" in config) and ("group" in config) and group is not None:
245
+ if config["group"] != group:
246
+ config["group_alias"] = None
247
+ return config
248
+
249
+ def _class_has_config_in_constructor(self, cls):
250
+ constructor = getattr(cls, "__init__", None)
251
+ return (
252
+ "config" in inspect.signature(constructor).parameters
253
+ if constructor
254
+ else False
255
+ )
256
+
257
+ def _load_individual_task_or_group(
258
+ self,
259
+ name_or_config: Optional[Union[str, dict]] = None,
260
+ parent_name: Optional[str] = None,
261
+ update_config: Optional[dict] = None,
262
+ ) -> Mapping:
263
+ def _load_task(config, task):
264
+ if "include" in config:
265
+ config = {
266
+ **utils.load_yaml_config(
267
+ yaml_path=None,
268
+ yaml_config={"include": config.pop("include")},
269
+ mode="full",
270
+ ),
271
+ **config,
272
+ }
273
+ if self._config_is_python_task(config):
274
+ if self._class_has_config_in_constructor(config["class"]):
275
+ task_object = config["class"](config=config)
276
+ else:
277
+ task_object = config["class"]()
278
+ if isinstance(task_object, ConfigurableTask):
279
+ # very scuffed: set task name here. TODO: fixme?
280
+ task_object.config.task = task
281
+ else:
282
+ if self.metadata is not None:
283
+ config["metadata"] = config.get("metadata", {}) | self.metadata
284
+ else:
285
+ config["metadata"] = config.get("metadata", {})
286
+ task_object = ConfigurableTask(config=config)
287
+
288
+ return {task: task_object}
289
+
290
+ def _get_group_and_subtask_from_config(
291
+ config: dict,
292
+ ) -> tuple[ConfigurableGroup, list[str]]:
293
+ if self.metadata is not None:
294
+ config["metadata"] = config.get("metadata", {}) | self.metadata
295
+ group_name = ConfigurableGroup(config=config)
296
+ subtask_list = []
297
+ for task in group_name.config["task"]:
298
+ if isinstance(task, str) and self._name_is_tag(task):
299
+ subtask_list.extend(self._get_tasklist(task))
300
+ else:
301
+ subtask_list.append(task)
302
+ return group_name, subtask_list
303
+
304
+ def _process_group_config(
305
+ config: dict, update_config: dict = None
306
+ ) -> tuple[dict, dict]:
307
+ if update_config is not None:
308
+ config = {**config, **update_config}
309
+ _update_config = {
310
+ k: v for k, v in config.items() if k not in GROUP_ONLY_KEYS
311
+ }
312
+ if not bool(_update_config):
313
+ _update_config = None
314
+
315
+ group_config = {k: v for k, v in config.items() if k in GROUP_ONLY_KEYS}
316
+ return group_config, _update_config
317
+
318
+ if isinstance(name_or_config, str):
319
+ if update_config is not None:
320
+ # Process name_or_config as a dict instead
321
+ name_or_config = {"task": name_or_config, **update_config}
322
+ elif self._name_is_task(name_or_config) or self._name_is_python_task(
323
+ name_or_config
324
+ ):
325
+ task_config = self._get_config(name_or_config)
326
+ return _load_task(task_config, task=name_or_config)
327
+ else:
328
+ subtask_list = self._get_tasklist(name_or_config)
329
+ if subtask_list == -1:
330
+ group_config = self._get_config(name_or_config)
331
+ group_config, update_config = _process_group_config(group_config)
332
+ group_name, subtask_list = _get_group_and_subtask_from_config(
333
+ group_config
334
+ )
335
+ else:
336
+ if self._name_is_tag(name_or_config):
337
+ fn = partial(
338
+ self._load_individual_task_or_group,
339
+ update_config=name_or_config
340
+ if isinstance(name_or_config, dict)
341
+ else None,
342
+ )
343
+ return dict(
344
+ collections.ChainMap(*map(fn, reversed(subtask_list)))
345
+ )
346
+ else:
347
+ group_name = ConfigurableGroup(
348
+ config={"group": name_or_config, "task": subtask_list}
349
+ )
350
+
351
+ if isinstance(name_or_config, dict):
352
+ if self._config_is_task(name_or_config):
353
+ name = name_or_config.pop("task")
354
+ if update_config is not None:
355
+ name_or_config = {**name_or_config, **update_config}
356
+ # If the name is registered as a group
357
+ if self._name_is_group(name):
358
+ group_config = self._get_config(name)
359
+
360
+ group_config, update_config = _process_group_config(
361
+ group_config, name_or_config
362
+ )
363
+ group_name, subtask_list = _get_group_and_subtask_from_config(
364
+ group_config
365
+ )
366
+ elif self._name_is_tag(name):
367
+ subtask_list = self._get_tasklist(name)
368
+ fn = partial(
369
+ self._load_individual_task_or_group,
370
+ update_config=name_or_config,
371
+ )
372
+ return dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
373
+ else:
374
+ if self._name_is_registered(name):
375
+ base_task_config = self._get_config(name)
376
+
377
+ # Check if this is a duplicate.
378
+ if parent_name is not None:
379
+ num_duplicate = len(
380
+ list(
381
+ filter(
382
+ lambda x: x.startswith(name),
383
+ self.task_group_map[parent_name],
384
+ )
385
+ )
386
+ )
387
+ if num_duplicate > 0:
388
+ name = f"{name}-{num_duplicate}"
389
+ self.task_group_map[parent_name].append(name)
390
+
391
+ task_config = {
392
+ **base_task_config,
393
+ **name_or_config,
394
+ }
395
+ else:
396
+ task_config = name_or_config
397
+ return _load_task(task_config, task=name)
398
+ else:
399
+ group_config, update_config = _process_group_config(name_or_config)
400
+ group_name, subtask_list = _get_group_and_subtask_from_config(
401
+ group_config
402
+ )
403
+
404
+ fn = partial(
405
+ self._load_individual_task_or_group,
406
+ parent_name=group_name,
407
+ update_config=update_config,
408
+ )
409
+ return {
410
+ group_name: dict(collections.ChainMap(*map(fn, reversed(subtask_list))))
411
+ }
412
+
413
def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict:
    """Loads a dictionary of task objects from a task name or list of names.

    :param task_list: Optional[Union[str, list]]
        Single task name or list of task names to be loaded. If None
        (the declared default), an empty dict is returned.

    :return
        Dictionary of task objects

    BUGFIX: the original crashed with a TypeError when called with the
    default `task_list=None` (mapping over None); None now yields {}.
    """
    if task_list is None:
        return {}
    if isinstance(task_list, str):
        task_list = [task_list]

    # Later entries win on name collisions, matching ChainMap semantics.
    return dict(
        collections.ChainMap(
            *(self._load_individual_task_or_group(task) for task in task_list)
        )
    )
434
+
435
def load_config(self, config: Dict):
    """Load a single task or group directly from an in-memory config dict."""
    loaded = self._load_individual_task_or_group(config)
    return loaded
437
+
438
def _get_task_and_group(self, task_dir: str):
    """Creates a dictionary of tasks indexed with the following metadata:
    - `type`, one of `task`, `python_task`, `group` or `tag`.
        `task` refers to regular task configs, `python_task` are special
        yaml files that only consist of `task` and `class` parameters.
        `group` are group configs. `tag`s are labels that can be assigned
        to tasks to assist in sorting and calling tasks of certain themes.
    - `yaml_path`, path to the yaml file. If the entry is a `group` that
        was configured through a task config, the yaml_path will be -1
        and all subtasks will be listed in `task` (see below).
    - `task`, reserved for entries with `type` as `group`. When a group
        config is created (as opposed to a task config having a `group`
        parameter set), this is set to -1 to avoid recursive indexing;
        the full subtask list is loaded lazily at evaluation time.

    :param task_dir: str
        A directory to check for tasks

    :return
        Dictionary of task names as key and task metadata
    """

    def _populate_tags_and_groups(config, task, tasks_and_groups, print_info):
        # Register `task` under every tag listed in its config.
        # (`print_info` is threaded through for a pending deprecation
        # notice but is currently unused.)
        # TODO: remove group in next release
        if "tag" in config:
            attr_list = config["tag"]
            if isinstance(attr_list, str):
                attr_list = [attr_list]

            for tag in attr_list:
                if tag not in tasks_and_groups:
                    tasks_and_groups[tag] = {
                        "type": "tag",
                        "task": [task],
                        "yaml_path": -1,
                    }
                elif tasks_and_groups[tag]["type"] != "tag":
                    eval_logger.info(
                        f"The tag '{tag}' is already registered as a group, this tag will not be registered. "
                        "This may affect tasks you want to call."
                    )
                    break
                else:
                    tasks_and_groups[tag]["task"].append(task)

    # TODO: remove group in next release
    print_info = True
    ignore_dirs = [
        "__pycache__",
        ".ipynb_checkpoints",
    ]
    tasks_and_groups = collections.defaultdict()
    for root, dirs, file_list in os.walk(task_dir):
        # Prune ignored directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]
        for f in file_list:
            if f.endswith(".yaml"):
                yaml_path = os.path.join(root, f)
                # BUGFIX: removed a stray debug `print(yaml_path)` that
                # spammed stdout for every yaml file during indexing.
                config = utils.load_yaml_config(yaml_path, mode="simple")
                if self._config_is_python_task(config):
                    # This is a python class config
                    task = config["task"]
                    tasks_and_groups[task] = {
                        "type": "python_task",
                        "yaml_path": yaml_path,
                    }
                    _populate_tags_and_groups(
                        config, task, tasks_and_groups, print_info
                    )
                elif self._config_is_group(config):
                    # This is a group config. `task: -1` signals that we
                    # don't need to index the subtask list here; it can be
                    # loaded lazily when the group is called.
                    tasks_and_groups[config["group"]] = {
                        "type": "group",
                        "task": -1,
                        "yaml_path": yaml_path,
                    }
                elif self._config_is_task(config):
                    # This is a task config
                    task = config["task"]
                    tasks_and_groups[task] = {
                        "type": "task",
                        "yaml_path": yaml_path,
                    }
                    _populate_tags_and_groups(
                        config, task, tasks_and_groups, print_info
                    )
                else:
                    eval_logger.debug(f"File {f} in {root} could not be loaded")

    return tasks_and_groups
543
+
544
+
545
def get_task_name_from_config(task_config: Dict[str, str]) -> str:
    """Derive a display name for a task config.

    Prefers the explicit `task` key; otherwise falls back to
    `dataset_path` optionally suffixed with `_dataset_name`.
    """
    if "task" in task_config:
        return task_config["task"]
    dataset_path = task_config["dataset_path"]
    if "dataset_name" in task_config:
        return f"{dataset_path}_{task_config['dataset_name']}"
    return f"{dataset_path}"
552
+
553
+
554
def get_task_name_from_object(task_object):
    """Best-effort name for an already-constructed task object.

    NOTE(review): the guard checks for a `config` attribute but then reads
    `_config` — presumably both exist on ConfigurableTask; confirm upstream.
    """
    if hasattr(task_object, "config"):
        return task_object._config["task"]

    # TODO: scrap this
    # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
    return getattr(task_object, "EVAL_HARNESS_NAME", type(task_object).__name__)
565
+
566
+
567
+ def _check_duplicates(task_dict: dict) -> None:
568
+ """helper function solely used in validating get_task_dict output.
569
+ Takes the output of dllm_eval.evaluator_utils.get_subtask_list and
570
+ returns a list of all leaf subtasks contained within, and errors if any such leaf subtasks are
571
+ "oversubscribed" to several disjoint groups.
572
+ """
573
+ subtask_names = []
574
+ for key, value in task_dict.items():
575
+ subtask_names.extend(value)
576
+
577
+ duplicate_tasks = {
578
+ task_name for task_name in subtask_names if subtask_names.count(task_name) > 1
579
+ }
580
+
581
+ # locate the potentially problematic groups that seem to 'compete' for constituent subtasks
582
+ competing_groups = [
583
+ group
584
+ for group in task_dict.keys()
585
+ if len(set(task_dict[group]).intersection(duplicate_tasks)) > 0
586
+ ]
587
+
588
+ if len(duplicate_tasks) > 0:
589
+ raise ValueError(
590
+ f"Found 1 or more tasks while trying to call get_task_dict() that were members of more than 1 called group: {list(duplicate_tasks)}. Offending groups: {competing_groups}. Please call groups which overlap their constituent tasks in separate evaluation runs."
591
+ )
592
+
593
+
594
def get_task_dict(
    task_name_list: Union[str, List[Union[str, Dict, Task]]],
    task_manager: Optional[TaskManager] = None,
):
    """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.

    :param task_name_list: List[Union[str, Dict, Task]]
        Task names (str), task config dicts, or pre-built Task objects.
    :param task_manager: TaskManager = None
        A TaskManager object that stores indexed tasks. If not set,
        one is created on demand. This should be set by the user
        if there are additional paths that want to be included
        via `include_path`.

    :return
        Dictionary of task objects
    """

    task_name_from_string_dict = {}
    task_name_from_config_dict = {}
    task_name_from_object_dict = {}

    if isinstance(task_name_list, str):
        task_name_list = [task_name_list]
    elif isinstance(task_name_list, list):
        if not all(isinstance(task, (str, dict, Task)) for task in task_name_list):
            raise TypeError(
                "Expected all list items to be of types 'str', 'dict', or 'Task', but at least one entry did not match."
            )
    else:
        raise TypeError(
            f"Expected a 'str' or 'list' but received {type(task_name_list)}."
        )

    string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
    others_task_name_list = [
        task for task in task_name_list if not isinstance(task, str)
    ]

    # BUGFIX: previously a TaskManager was only instantiated when string task
    # names were present, so calling with only config dicts (and no explicit
    # task_manager) crashed with AttributeError on `None.load_config`.
    needs_manager = bool(string_task_name_list) or any(
        isinstance(task, dict) for task in others_task_name_list
    )
    if task_manager is None and needs_manager:
        task_manager = TaskManager()

    if len(string_task_name_list) > 0:
        task_name_from_string_dict = task_manager.load_task_or_group(
            string_task_name_list
        )

    for task_element in others_task_name_list:
        if isinstance(task_element, dict):
            task_name_from_config_dict = {
                **task_name_from_config_dict,
                **task_manager.load_config(config=task_element),
            }
        elif isinstance(task_element, Task):
            task_name_from_object_dict = {
                **task_name_from_object_dict,
                get_task_name_from_object(task_element): task_element,
            }

    if not set(task_name_from_string_dict.keys()).isdisjoint(
        set(task_name_from_object_dict.keys())
    ):
        # Previously a bare `raise ValueError` with no message.
        raise ValueError(
            "Task names resolved from strings overlap with names of the "
            "pre-built Task objects passed in; each task may only be "
            "provided once."
        )

    final_task_dict = {
        **task_name_from_string_dict,
        **task_name_from_config_dict,
        **task_name_from_object_dict,
    }

    # behavior can get odd if one tries to invoke several groups that "compete" for the same task.
    # (notably, because one could request several num_fewshot values at once in GroupConfig overrides for the subtask
    # and we'd be unsure which to use and report.)
    # we explicitly check and error in this case.
    _check_duplicates(get_subtask_list(final_task_dict))

    return final_task_dict
Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/gsm8k/gsm8k.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: gsm8k
2
+ dataset_path: openai/gsm8k
3
+ dataset_name: main
4
+ output_type: generate_until
5
+ training_split: train
6
+ fewshot_split: train
7
+ test_split: test
8
+ doc_to_text: !function utils.gsm_prompt
9
+ doc_to_target: "{{answer.split('####')[-1].strip()}}"
10
+ generation_kwargs:
11
+ until:
12
+ - "[NO_UNTIL_PLACEHOLDER]"
13
+ do_sample: false
14
+ repeats: 1
15
+ num_fewshot: 0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/gsm8k/utils.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def gsm_prompt(doc):
    """Render the GSM8K question in ``doc`` into the full generation prompt.

    The fixed instruction header asks for step-by-step reasoning in
    <reasoning> tags and a \\boxed{} final answer in <answer> tags; the
    question text is appended after a blank line.
    """
    header = (
        "You are a math expert. You will be given a question to solve. Solve it step by step. Wrap the final answer in a \\boxed{}. \n"
        "Respond in the following format:\n"
        "<reasoning>\n"
        "Your reasoning here\n"
        "</reasoning>\n"
        "<answer>\n"
        "\\boxed{...}\n"
        "</answer>"
    )
    return f"{header}\n\n{doc['question']}\n\n"
Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/humaneval/humaneval.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: humaneval
2
+ dataset_path: openai/openai_humaneval
3
+ unsafe_code: true
4
+ output_type: generate_until
5
+ test_split: test
6
+ doc_to_text: "Write a solution to the following problem and make sure that it passes the tests:\n{{prompt}}\n\nFirst, reason about the solution step-by-step. Then, write the code.\nRespond in the following format:\n<reasoning>\nYour reasoning here\n</reasoning>\n<answer>\n```python\nThe complete implementation of the {{entry_point}} function\n```\n</answer>"
7
+ doc_to_target: "{{test}}\ncheck({{entry_point}})"
8
+ generation_kwargs:
9
+ until:
10
+ - "[NO_UNTIL_PLACEHOLDER]"
11
+ do_sample: false
12
+ repeats: 1
13
+ num_fewshot: 0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/humaneval/utils.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import evaluate as hf_evaluate
2
+
3
+
4
# Load the HF `code_eval` metric and run a tiny sanity check at import time,
# so a broken install or a missing HF_ALLOW_CODE_EVAL opt-in fails fast,
# before any model generation is spent.
try:
    compute_ = hf_evaluate.load("code_eval")
    test_cases = ["assert add(2, 3)==5"]
    candidates = [["def add(a,b): return a*b"]]
    results = compute_.compute(references=test_cases, predictions=candidates, k=[1])
except Exception:
    # Bare `raise` preserves the original traceback; the previous
    # `except Exception as e: raise e` added a useless re-raise frame.
    raise
11
+
12
+
13
def pass_at_k(
    references: list[str],
    predictions: list[list[str]],
    k: int | list[int] | None = None,
):
    """Compute HF `code_eval` pass@k scores.

    :param references: one test-harness string per problem
    :param predictions: per-problem lists of candidate programs
    :param k: the k value(s) to score; an int is promoted to a one-element
        list. Must be provided.
    :return: dict mapping "pass@k" keys to float scores (first element of
        the metric's (scores, details) tuple)
    :raises ValueError: if `k` is None (was an `assert`, which is stripped
        under `python -O`)
    """
    if k is None:
        raise ValueError("`k` must be provided (an int or a list of ints).")
    if isinstance(k, int):
        k = [k]
    # `compute_` is the module-level code_eval metric; reading a global
    # needs no `global` declaration (the original's was redundant).
    res = compute_.compute(
        references=references,
        predictions=predictions,
        k=k,
    )
    return res[0]
24
+
25
+
26
def clean_response_string(r: str) -> str:
    """Trim a model response down to its final fenced Python block.

    Keeps everything from the last "```python" marker (inclusive), cuts at
    the last "```" occurrence, then cuts at any trailing
    `if __name__ == "__main__":` guard. Each cut is skipped when its marker
    is absent.
    """
    out = r
    start = out.rfind("```python")
    if start != -1:
        out = out[start:]
    end = out.rfind("```")
    if end != -1:
        out = out[:end]
    guard = out.rfind("if __name__ == \"__main__\":")
    if guard != -1:
        out = out[:guard]
    return out
31
+
32
+
33
def build_predictions(
    resps: list[list[str]], docs: list[dict]
) -> list[list[str]]:
    """Post-process raw model responses into code_eval candidate lists.

    NOTE(review): an earlier, identically-named definition that prepended
    ``doc["prompt"]`` to each response was immediately shadowed by this one
    and has been removed as dead code.

    :param resps: per-document lists of raw model completions
    :param docs: corresponding dataset documents (only used for pairing)
    """
    return [
        [clean_response_string(r) for r in resp]
        for resp, doc in zip(resps, docs)
    ]
Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/mbpp/mbpp.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: mbpp
2
+ dataset_path: google-research-datasets/mbpp
3
+ dataset_name: full
4
+ unsafe_code: true
5
+ output_type: generate_until
6
+ test_split: test
7
+ doc_to_text: "\n{{text}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}} \n\nFirst, reason about the solution step-by-step. Then, write the code.\nRespond in the following format:\n<reasoning>\nYour reasoning here\n</reasoning>\n<answer>\n```python\nThe complete implementation of the function\n```\n</answer>"
8
+ doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}"
9
+ target_delimiter: ""
10
+ generation_kwargs:
11
+ until:
12
+ - "[NO_UNTIL_PLACEHOLDER]"
13
+ do_sample: false
14
+ num_fewshot: 0
Prism/LLaDA/LLaDA_Baseline/dllm_eval/tasks/mbpp/utils.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Union
3
+
4
+ import evaluate as hf_evaluate
5
+
6
+
7
# Load the HF `code_eval` metric and sanity-check it at import time, so that
# code execution being disabled (HF_ALLOW_CODE_EVAL unset) surfaces before
# any model generation is spent.
try:
    pass_at_k = hf_evaluate.load("code_eval")

    # run simple test to check code execution is enabled before model generation
    test_cases = ["assert add(2, 3)==5"]
    candidates = [["def add(a,b): return a*b"]]
    results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1])
except Exception:
    # Bare `raise` keeps the original traceback; `raise e` obscured the
    # real failure site with an extra frame.
    raise
16
+
17
+
18
def pass_at_1(
    references: Union[str, list[str]], predictions: Union[str, list[list[str]]]
) -> float:
    """Compute pass@1 for MBPP via the module-level `pass_at_k` code_eval metric.

    :param references: test harness string(s); a bare str is wrapped in a list
    :param predictions: candidate program(s); a flat list of strings is
        promoted to one single-candidate list per entry
    :return: the "pass@1" score as a float

    Robustness fix: an empty `predictions` no longer raises IndexError on
    the `predictions[0]` type probe.
    """
    if isinstance(references, str):
        references = [references]
    if predictions and isinstance(predictions[0], str):
        predictions = [[p] for p in predictions]
    return pass_at_k.compute(
        references=references,
        predictions=predictions,
        k=[1],
        num_workers=48,  # parallel sandbox workers for test execution
    )[0]["pass@1"]
31
+
32
+
33
def extract_code_blocks(text: str) -> str:
    """Strip generation-terminator markers from a completion.

    NOTE: despite the name, this does not extract fenced code blocks — it
    only removes the literal stop/EOS tokens "[DONE]", "<|eot_id|>" and
    "<|endoftext|>" (plain substring removal; the original regexes matched
    these literals only).
    """
    for marker in ("[DONE]", "<|eot_id|>", "<|endoftext|>"):
        text = text.replace(marker, "")
    return text
38
+
39
+
40
def build_predictions(resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
    """Strip terminator markers from every completion, preserving shape.

    `docs` is accepted for interface parity with other tasks but unused.
    """
    cleaned = []
    for resp in resps:
        cleaned.append([extract_code_blocks(r) for r in resp])
    return cleaned
42
+
43
+
44
def list_fewshot_samples():
    """Return the three hand-picked MBPP few-shot exemplars.

    Each dict mirrors an MBPP dataset row (`task_id`, `text`, `code`,
    `test_list`) plus `is_fewshot: True`, which the task's `doc_to_target`
    template uses to emit the gold code followed by the "[DONE]" terminator
    instead of the test assertions.
    """
    return [
        {
            "task_id": 2,
            "text": "Write a function to find the similar elements from the given two tuple lists.",
            "code": "def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res) ",
            "test_list": [
                "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
                "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
                "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)",
            ],
            "is_fewshot": True,
        },
        {
            "task_id": 3,
            "text": "Write a python function to identify non-prime numbers.",
            "code": "import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result",
            "test_list": [
                "assert is_not_prime(2) == False",
                "assert is_not_prime(10) == True",
                "assert is_not_prime(35) == True",
            ],
            "is_fewshot": True,
        },
        {
            "task_id": 4,
            "text": "Write a function to find the largest integers from a given list of numbers using heap queue algorithm.",
            "code": "import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums",
            "test_list": [
                "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] ",
                "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] ",
                "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]",
            ],
            "is_fewshot": True,
        },
    ]
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/certifi/__main__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse

from certifi import contents, where

# CLI entry point for `python -m certifi` (vendored certifi package):
# prints the filesystem path of the bundled CA certificate file, or the
# PEM contents themselves when -c/--contents is passed.
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()

if args.contents:
    print(contents())
else:
    print(where())
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "{}"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2013-2019 Nikolay Kim and Andrew Svetlov
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/METADATA ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: frozenlist
3
+ Version: 1.5.0
4
+ Summary: A list-like structure which implements collections.abc.MutableSequence
5
+ Home-page: https://github.com/aio-libs/frozenlist
6
+ Maintainer: aiohttp team <team@aiohttp.org>
7
+ Maintainer-email: team@aiohttp.org
8
+ License: Apache 2
9
+ Project-URL: Chat: Matrix, https://matrix.to/#/#aio-libs:matrix.org
10
+ Project-URL: Chat: Matrix Space, https://matrix.to/#/#aio-libs-space:matrix.org
11
+ Project-URL: CI: Github Actions, https://github.com/aio-libs/frozenlist/actions
12
+ Project-URL: Code of Conduct, https://github.com/aio-libs/.github/blob/master/CODE_OF_CONDUCT.md
13
+ Project-URL: Coverage: codecov, https://codecov.io/github/aio-libs/frozenlist
14
+ Project-URL: Docs: Changelog, https://github.com/aio-libs/frozenlist/blob/master/CHANGES.rst#changelog
15
+ Project-URL: Docs: RTD, https://frozenlist.aio-libs.org
16
+ Project-URL: GitHub: issues, https://github.com/aio-libs/frozenlist/issues
17
+ Project-URL: GitHub: repo, https://github.com/aio-libs/frozenlist
18
+ Classifier: Development Status :: 5 - Production/Stable
19
+ Classifier: Intended Audience :: Developers
20
+ Classifier: License :: OSI Approved :: Apache Software License
21
+ Classifier: Operating System :: POSIX
22
+ Classifier: Operating System :: MacOS :: MacOS X
23
+ Classifier: Operating System :: Microsoft :: Windows
24
+ Classifier: Programming Language :: Cython
25
+ Classifier: Programming Language :: Python
26
+ Classifier: Programming Language :: Python :: 3
27
+ Classifier: Programming Language :: Python :: 3.8
28
+ Classifier: Programming Language :: Python :: 3.9
29
+ Classifier: Programming Language :: Python :: 3.10
30
+ Classifier: Programming Language :: Python :: 3.11
31
+ Classifier: Programming Language :: Python :: 3.12
32
+ Classifier: Programming Language :: Python :: 3.13
33
+ Classifier: Programming Language :: Python :: Implementation :: CPython
34
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
35
+ Requires-Python: >=3.8
36
+ Description-Content-Type: text/x-rst
37
+ License-File: LICENSE
38
+
39
+ frozenlist
40
+ ==========
41
+
42
+ .. image:: https://github.com/aio-libs/frozenlist/workflows/CI/badge.svg
43
+ :target: https://github.com/aio-libs/frozenlist/actions
44
+ :alt: GitHub status for master branch
45
+
46
+ .. image:: https://codecov.io/gh/aio-libs/frozenlist/branch/master/graph/badge.svg
47
+ :target: https://codecov.io/gh/aio-libs/frozenlist
48
+ :alt: codecov.io status for master branch
49
+
50
+ .. image:: https://img.shields.io/pypi/v/frozenlist.svg?logo=Python&logoColor=white
51
+ :target: https://pypi.org/project/frozenlist
52
+ :alt: frozenlist @ PyPI
53
+
54
+ .. image:: https://readthedocs.org/projects/frozenlist/badge/?version=latest
55
+ :target: https://frozenlist.aio-libs.org
56
+ :alt: Read The Docs build status badge
57
+
58
+ .. image:: https://img.shields.io/matrix/aio-libs:matrix.org?label=Discuss%20on%20Matrix%20at%20%23aio-libs%3Amatrix.org&logo=matrix&server_fqdn=matrix.org&style=flat
59
+ :target: https://matrix.to/#/%23aio-libs:matrix.org
60
+ :alt: Matrix Room — #aio-libs:matrix.org
61
+
62
+ .. image:: https://img.shields.io/matrix/aio-libs-space:matrix.org?label=Discuss%20on%20Matrix%20at%20%23aio-libs-space%3Amatrix.org&logo=matrix&server_fqdn=matrix.org&style=flat
63
+ :target: https://matrix.to/#/%23aio-libs-space:matrix.org
64
+ :alt: Matrix Space — #aio-libs-space:matrix.org
65
+
66
+ Introduction
67
+ ------------
68
+
69
+ ``frozenlist.FrozenList`` is a list-like structure which implements
70
+ ``collections.abc.MutableSequence``. The list is *mutable* until ``FrozenList.freeze``
71
+ is called, after which list modifications raise ``RuntimeError``:
72
+
73
+
74
+ >>> from frozenlist import FrozenList
75
+ >>> fl = FrozenList([17, 42])
76
+ >>> fl.append('spam')
77
+ >>> fl.append('Vikings')
78
+ >>> fl
79
+ <FrozenList(frozen=False, [17, 42, 'spam', 'Vikings'])>
80
+ >>> fl.freeze()
81
+ >>> fl
82
+ <FrozenList(frozen=True, [17, 42, 'spam', 'Vikings'])>
83
+ >>> fl.frozen
84
+ True
85
+ >>> fl.append("Monty")
86
+ Traceback (most recent call last):
87
+ File "<stdin>", line 1, in <module>
88
+ File "frozenlist/_frozenlist.pyx", line 97, in frozenlist._frozenlist.FrozenList.append
89
+ self._check_frozen()
90
+ File "frozenlist/_frozenlist.pyx", line 19, in frozenlist._frozenlist.FrozenList._check_frozen
91
+ raise RuntimeError("Cannot modify frozen list.")
92
+ RuntimeError: Cannot modify frozen list.
93
+
94
+
95
+ FrozenList is also hashable, but only when frozen. Otherwise it also throws a RuntimeError:
96
+
97
+
98
+ >>> fl = FrozenList([17, 42, 'spam'])
99
+ >>> hash(fl)
100
+ Traceback (most recent call last):
101
+ File "<stdin>", line 1, in <module>
102
+ File "frozenlist/_frozenlist.pyx", line 111, in frozenlist._frozenlist.FrozenList.__hash__
103
+ raise RuntimeError("Cannot hash unfrozen list.")
104
+ RuntimeError: Cannot hash unfrozen list.
105
+ >>> fl.freeze()
106
+ >>> hash(fl)
107
+ 3713081631934410656
108
+ >>> dictionary = {fl: 'Vikings'} # frozen fl can be a dict key
109
+ >>> dictionary
110
+ {<FrozenList(frozen=True, [1, 2])>: 'Vikings'}
111
+
112
+
113
+ Installation
114
+ ------------
115
+
116
+ ::
117
+
118
+ $ pip install frozenlist
119
+
120
+ The library requires Python 3.8 or newer.
121
+
122
+
123
+ Documentation
124
+ -------------
125
+
126
+ https://frozenlist.aio-libs.org
127
+
128
+ Communication channels
129
+ ----------------------
130
+
131
+ We have a *Matrix Space* `#aio-libs-space:matrix.org
132
+ <https://matrix.to/#/%23aio-libs-space:matrix.org>`_ which is
133
+ also accessible via Gitter.
134
+
135
+ Requirements
136
+ ------------
137
+
138
+ - Python >= 3.8
139
+
140
+ License
141
+ -------
142
+
143
+ ``frozenlist`` is offered under the Apache 2 license.
144
+
145
+ Source code
146
+ -----------
147
+
148
+ The project is hosted on GitHub_
149
+
150
+ Please file an issue in the `bug tracker
151
+ <https://github.com/aio-libs/frozenlist/issues>`_ if you have found a bug
152
+ or have some suggestions to improve the library.
153
+
154
+ .. _GitHub: https://github.com/aio-libs/frozenlist
155
+
156
+ =========
157
+ Changelog
158
+ =========
159
+
160
+ ..
161
+ You should *NOT* be adding new change log entries to this file, this
162
+ file is managed by towncrier. You *may* edit previous change logs to
163
+ fix problems like typo corrections or such.
164
+ To add a new change log entry, please see
165
+ https://pip.pypa.io/en/latest/development/contributing/#news-entries
166
+ we named the news folder "changes".
167
+
168
+ WARNING: Don't drop the next directive!
169
+
170
+ .. towncrier release notes start
171
+
172
+ 1.5.0 (2024-10-22)
173
+ ==================
174
+
175
+ Bug fixes
176
+ ---------
177
+
178
+ - An incorrect signature of the ``__class_getitem__`` class method
179
+ has been fixed, adding a missing ``class_item`` argument under
180
+ Python 3.8 and older.
181
+
182
+ This change also improves the code coverage of this method that
183
+ was previously missing -- by `@webknjaz <https://github.com/sponsors/webknjaz>`__.
184
+
185
+
186
+ *Related issues and pull requests on GitHub:*
187
+ `#567 <https://github.com/aio-libs/frozenlist/issues/567>`__, `#571 <https://github.com/aio-libs/frozenlist/issues/571>`__.
188
+
189
+
190
+ Improved documentation
191
+ ----------------------
192
+
193
+ - Rendered issue, PR, and commit links now lead to
194
+ ``frozenlist``'s repo instead of ``yarl``'s repo.
195
+
196
+
197
+ *Related issues and pull requests on GitHub:*
198
+ `#573 <https://github.com/aio-libs/frozenlist/issues/573>`__.
199
+
200
+ - On the ``Contributing docs`` page,
201
+ a link to the ``Towncrier philosophy`` has been fixed.
202
+
203
+
204
+ *Related issues and pull requests on GitHub:*
205
+ `#574 <https://github.com/aio-libs/frozenlist/issues/574>`__.
206
+
207
+
208
+ Packaging updates and notes for downstreams
209
+ -------------------------------------------
210
+
211
+ - A name of a temporary building directory now reflects
212
+ that it's related to ``frozenlist``, not ``yarl``.
213
+
214
+
215
+ *Related issues and pull requests on GitHub:*
216
+ `#573 <https://github.com/aio-libs/frozenlist/issues/573>`__.
217
+
218
+ - Declared Python 3.13 supported officially in the distribution package metadata.
219
+
220
+
221
+ *Related issues and pull requests on GitHub:*
222
+ `#595 <https://github.com/aio-libs/frozenlist/issues/595>`__.
223
+
224
+
225
+ ----
226
+
227
+
228
+ 1.4.1 (2023-12-15)
229
+ ==================
230
+
231
+ Packaging updates and notes for downstreams
232
+ -------------------------------------------
233
+
234
+ - Declared Python 3.12 and PyPy 3.8-3.10 supported officially
235
+ in the distribution package metadata.
236
+
237
+
238
+ *Related issues and pull requests on GitHub:*
239
+ `#553 <https://github.com/aio-libs/frozenlist/issues/553>`__.
240
+
241
+ - Replaced the packaging is replaced from an old-fashioned ``setup.py`` to an
242
+ in-tree `PEP 517 <https://peps.python.org/pep-517>`__ build backend -- by `@webknjaz <https://github.com/sponsors/webknjaz>`__.
243
+
244
+ Whenever the end-users or downstream packagers need to build ``frozenlist``
245
+ from source (a Git checkout or an sdist), they may pass a ``config_settings``
246
+ flag ``pure-python``. If this flag is not set, a C-extension will be built
247
+ and included into the distribution.
248
+
249
+ Here is how this can be done with ``pip``:
250
+
251
+ .. code-block:: console
252
+
253
+ $ python3 -m pip install . --config-settings=pure-python=
254
+
255
+ This will also work with ``-e | --editable``.
256
+
257
+ The same can be achieved via ``pypa/build``:
258
+
259
+ .. code-block:: console
260
+
261
+ $ python3 -m build --config-setting=pure-python=
262
+
263
+ Adding ``-w | --wheel`` can force ``pypa/build`` produce a wheel from source
264
+ directly, as opposed to building an ``sdist`` and then building from it.
265
+
266
+
267
+ *Related issues and pull requests on GitHub:*
268
+ `#560 <https://github.com/aio-libs/frozenlist/issues/560>`__.
269
+
270
+
271
+ Contributor-facing changes
272
+ --------------------------
273
+
274
+ - It is now possible to request line tracing in Cython builds using the
275
+ ``with-cython-tracing`` `PEP 517 <https://peps.python.org/pep-517>`__ config setting
276
+ -- `@webknjaz <https://github.com/sponsors/webknjaz>`__.
277
+
278
+ This can be used in CI and development environment to measure coverage
279
+ on Cython modules, but is not normally useful to the end-users or
280
+ downstream packagers.
281
+
282
+ Here's a usage example:
283
+
284
+ .. code-block:: console
285
+
286
+ $ python3 -Im pip install . --config-settings=with-cython-tracing=true
287
+
288
+ For editable installs, this setting is on by default. Otherwise, it's
289
+ off unless requested explicitly.
290
+
291
+ The following produces C-files required for the Cython coverage
292
+ plugin to map the measurements back to the PYX-files:
293
+
294
+ .. code-block:: console
295
+
296
+ $ python -Im pip install -e .
297
+
298
+ Alternatively, the ``FROZENLIST_CYTHON_TRACING=1`` environment variable
299
+ can be set to do the same as the `PEP 517 <https://peps.python.org/pep-517>`__ config setting.
300
+
301
+
302
+ *Related issues and pull requests on GitHub:*
303
+ `#560 <https://github.com/aio-libs/frozenlist/issues/560>`__.
304
+
305
+ - Coverage collection has been implemented for the Cython modules
306
+ -- by `@webknjaz <https://github.com/sponsors/webknjaz>`__.
307
+
308
+ It will also be reported to Codecov from any non-release CI jobs.
309
+
310
+
311
+ *Related issues and pull requests on GitHub:*
312
+ `#561 <https://github.com/aio-libs/frozenlist/issues/561>`__.
313
+
314
+ - A step-by-step ``Release Guide`` guide has
315
+ been added, describing how to release *frozenlist* -- by `@webknjaz <https://github.com/sponsors/webknjaz>`__.
316
+
317
+ This is primarily targeting the maintainers.
318
+
319
+
320
+ *Related issues and pull requests on GitHub:*
321
+ `#563 <https://github.com/aio-libs/frozenlist/issues/563>`__.
322
+
323
+ - Detailed ``Contributing Guidelines`` on
324
+ authoring the changelog fragments have been published in the
325
+ documentation -- by `@webknjaz <https://github.com/sponsors/webknjaz>`__.
326
+
327
+
328
+ *Related issues and pull requests on GitHub:*
329
+ `#564 <https://github.com/aio-libs/frozenlist/issues/564>`__.
330
+
331
+
332
+ ----
333
+
334
+
335
+ 1.4.0 (2023-07-12)
336
+ ==================
337
+
338
+ The published source distribution package became buildable
339
+ under Python 3.12.
340
+
341
+
342
+ ----
343
+
344
+
345
+ Bugfixes
346
+ --------
347
+
348
+ - Removed an unused ``typing.Tuple`` import
349
+ `#411 <https://github.com/aio-libs/frozenlist/issues/411>`_
350
+
351
+
352
+ Deprecations and Removals
353
+ -------------------------
354
+
355
+ - Dropped Python 3.7 support.
356
+ `#413 <https://github.com/aio-libs/frozenlist/issues/413>`_
357
+
358
+
359
+ Misc
360
+ ----
361
+
362
+ - `#410 <https://github.com/aio-libs/frozenlist/issues/410>`_, `#433 <https://github.com/aio-libs/frozenlist/issues/433>`_
363
+
364
+
365
+ ----
366
+
367
+
368
+ 1.3.3 (2022-11-08)
369
+ ==================
370
+
371
+ - Fixed CI runs when creating a new release, where new towncrier versions
372
+ fail when the current version section is already present.
373
+
374
+
375
+ ----
376
+
377
+
378
+ 1.3.2 (2022-11-08)
379
+ ==================
380
+
381
+ Misc
382
+ ----
383
+
384
+ - Updated the CI runs to better check for test results and to avoid deprecated syntax. `#327 <https://github.com/aio-libs/frozenlist/issues/327>`_
385
+
386
+
387
+ ----
388
+
389
+
390
+ 1.3.1 (2022-08-02)
391
+ ==================
392
+
393
+ The published source distribution package became buildable
394
+ under Python 3.11.
395
+
396
+
397
+ ----
398
+
399
+
400
+ 1.3.0 (2022-01-18)
401
+ ==================
402
+
403
+ Bugfixes
404
+ --------
405
+
406
+ - Do not install C sources with binary distributions.
407
+ `#250 <https://github.com/aio-libs/frozenlist/issues/250>`_
408
+
409
+
410
+ Deprecations and Removals
411
+ -------------------------
412
+
413
+ - Dropped Python 3.6 support
414
+ `#274 <https://github.com/aio-libs/frozenlist/issues/274>`_
415
+
416
+
417
+ ----
418
+
419
+
420
+ 1.2.0 (2021-10-16)
421
+ ==================
422
+
423
+ Features
424
+ --------
425
+
426
+ - ``FrozenList`` now supports being used as a generic type as per PEP 585, e.g. ``frozen_int_list: FrozenList[int]`` (requires Python 3.9 or newer).
427
+ `#172 <https://github.com/aio-libs/frozenlist/issues/172>`_
428
+ - Added support for Python 3.10.
429
+ `#227 <https://github.com/aio-libs/frozenlist/issues/227>`_
430
+ - Started shipping platform-specific wheels with the ``musl`` tag targeting typical Alpine Linux runtimes.
431
+ `#227 <https://github.com/aio-libs/frozenlist/issues/227>`_
432
+ - Started shipping platform-specific arm64 wheels for Apple Silicon.
433
+ `#227 <https://github.com/aio-libs/frozenlist/issues/227>`_
434
+
435
+
436
+ ----
437
+
438
+
439
+ 1.1.1 (2020-11-14)
440
+ ==================
441
+
442
+ Bugfixes
443
+ --------
444
+
445
+ - Provide x86 Windows wheels.
446
+ `#169 <https://github.com/aio-libs/frozenlist/issues/169>`_
447
+
448
+
449
+ ----
450
+
451
+
452
+ 1.1.0 (2020-10-13)
453
+ ==================
454
+
455
+ Features
456
+ --------
457
+
458
+ - Add support for hashing of a frozen list.
459
+ `#136 <https://github.com/aio-libs/frozenlist/issues/136>`_
460
+
461
+ - Support Python 3.8 and 3.9.
462
+
463
+ - Provide wheels for ``aarch64``, ``i686``, ``ppc64le``, ``s390x`` architectures on
464
+ Linux as well as ``x86_64``.
465
+
466
+
467
+ ----
468
+
469
+
470
+ 1.0.0 (2019-11-09)
471
+ ==================
472
+
473
+ Deprecations and Removals
474
+ -------------------------
475
+
476
+ - Dropped support for Python 3.5; only 3.6, 3.7 and 3.8 are supported going forward.
477
+ `#24 <https://github.com/aio-libs/frozenlist/issues/24>`_
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ frozenlist-1.5.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ frozenlist-1.5.0.dist-info/LICENSE,sha256=b9UkPpLdf5jsacesN3co50kFcJ_1J6W_mNbQJjwE9bY,11332
3
+ frozenlist-1.5.0.dist-info/METADATA,sha256=BpQvB7z2NbU3f4XTQDvhAZ9L08WR4XiYajilj9IY6Yk,13762
4
+ frozenlist-1.5.0.dist-info/RECORD,,
5
+ frozenlist-1.5.0.dist-info/WHEEL,sha256=64hRuO2b8JU2aeheZgbK9oQwal3JVqwtqRhpQNr8ZdQ,224
6
+ frozenlist-1.5.0.dist-info/top_level.txt,sha256=jivtxsPXA3nK3WBWW2LW5Mtu_GHt8UZA13NeCs2cKuA,11
7
+ frozenlist/__init__.py,sha256=ymVtnW3MinO-Ux3cBj_PLEpXnmLawk45el8vcX6IkWY,2371
8
+ frozenlist/__init__.pyi,sha256=vMEoES1xGegPtVXoCi9XydEeHsyuIq-KdeXwP5PdsaA,1470
9
+ frozenlist/__pycache__/__init__.cpython-312.pyc,,
10
+ frozenlist/_frozenlist.cpython-312-x86_64-linux-gnu.so,sha256=n65G8t1lqSUcWICd9rjOJujV1lxtniI2JJQQXtc7BjQ,961592
11
+ frozenlist/_frozenlist.pyx,sha256=4YturclNF7wioO7YX3Vzl7Ldb2-iswe6UrjJOMKSswU,2993
12
+ frozenlist/py.typed,sha256=sow9soTwP9T_gEAQSVh7Gb8855h04Nwmhs2We-JRgZM,7
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/WHEEL ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.2.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp312-cp312-manylinux_2_5_x86_64
5
+ Tag: cp312-cp312-manylinux1_x86_64
6
+ Tag: cp312-cp312-manylinux_2_17_x86_64
7
+ Tag: cp312-cp312-manylinux2014_x86_64
8
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/frozenlist-1.5.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ frozenlist
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Nathaniel J. Smith <njs@pobox.com> and other contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/METADATA ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: h11
3
+ Version: 0.14.0
4
+ Summary: A pure-Python, bring-your-own-I/O implementation of HTTP/1.1
5
+ Home-page: https://github.com/python-hyper/h11
6
+ Author: Nathaniel J. Smith
7
+ Author-email: njs@pobox.com
8
+ License: MIT
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: Implementation :: CPython
13
+ Classifier: Programming Language :: Python :: Implementation :: PyPy
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3 :: Only
16
+ Classifier: Programming Language :: Python :: 3.7
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Topic :: Internet :: WWW/HTTP
21
+ Classifier: Topic :: System :: Networking
22
+ Requires-Python: >=3.7
23
+ License-File: LICENSE.txt
24
+ Requires-Dist: typing-extensions ; python_version < "3.8"
25
+
26
+ h11
27
+ ===
28
+
29
+ .. image:: https://travis-ci.org/python-hyper/h11.svg?branch=master
30
+ :target: https://travis-ci.org/python-hyper/h11
31
+ :alt: Automated test status
32
+
33
+ .. image:: https://codecov.io/gh/python-hyper/h11/branch/master/graph/badge.svg
34
+ :target: https://codecov.io/gh/python-hyper/h11
35
+ :alt: Test coverage
36
+
37
+ .. image:: https://readthedocs.org/projects/h11/badge/?version=latest
38
+ :target: http://h11.readthedocs.io/en/latest/?badge=latest
39
+ :alt: Documentation Status
40
+
41
+ This is a little HTTP/1.1 library written from scratch in Python,
42
+ heavily inspired by `hyper-h2 <https://hyper-h2.readthedocs.io/>`_.
43
+
44
+ It's a "bring-your-own-I/O" library; h11 contains no IO code
45
+ whatsoever. This means you can hook h11 up to your favorite network
46
+ API, and that could be anything you want: synchronous, threaded,
47
+ asynchronous, or your own implementation of `RFC 6214
48
+ <https://tools.ietf.org/html/rfc6214>`_ -- h11 won't judge you.
49
+ (Compare this to the current state of the art, where every time a `new
50
+ network API <https://trio.readthedocs.io/>`_ comes along then someone
51
+ gets to start over reimplementing the entire HTTP protocol from
52
+ scratch.) Cory Benfield made an `excellent blog post describing the
53
+ benefits of this approach
54
+ <https://lukasa.co.uk/2015/10/The_New_Hyper/>`_, or if you like video
55
+ then here's his `PyCon 2016 talk on the same theme
56
+ <https://www.youtube.com/watch?v=7cC3_jGwl_U>`_.
57
+
58
+ This also means that h11 is not immediately useful out of the box:
59
+ it's a toolkit for building programs that speak HTTP, not something
60
+ that could directly replace ``requests`` or ``twisted.web`` or
61
+ whatever. But h11 makes it much easier to implement something like
62
+ ``requests`` or ``twisted.web``.
63
+
64
+ At a high level, working with h11 goes like this:
65
+
66
+ 1) First, create an ``h11.Connection`` object to track the state of a
67
+ single HTTP/1.1 connection.
68
+
69
+ 2) When you read data off the network, pass it to
70
+ ``conn.receive_data(...)``; you'll get back a list of objects
71
+ representing high-level HTTP "events".
72
+
73
+ 3) When you want to send a high-level HTTP event, create the
74
+ corresponding "event" object and pass it to ``conn.send(...)``;
75
+ this will give you back some bytes that you can then push out
76
+ through the network.
77
+
78
+ For example, a client might instantiate and then send a
79
+ ``h11.Request`` object, then zero or more ``h11.Data`` objects for the
80
+ request body (e.g., if this is a POST), and then a
81
+ ``h11.EndOfMessage`` to indicate the end of the message. Then the
82
+ server would then send back a ``h11.Response``, some ``h11.Data``, and
83
+ its own ``h11.EndOfMessage``. If either side violates the protocol,
84
+ you'll get a ``h11.ProtocolError`` exception.
85
+
86
+ h11 is suitable for implementing both servers and clients, and has a
87
+ pleasantly symmetric API: the events you send as a client are exactly
88
+ the ones that you receive as a server and vice-versa.
89
+
90
+ `Here's an example of a tiny HTTP client
91
+ <https://github.com/python-hyper/h11/blob/master/examples/basic-client.py>`_
92
+
93
+ It also has `a fine manual <https://h11.readthedocs.io/>`_.
94
+
95
+ FAQ
96
+ ---
97
+
98
+ *Whyyyyy?*
99
+
100
+ I wanted to play with HTTP in `Curio
101
+ <https://curio.readthedocs.io/en/latest/tutorial.html>`__ and `Trio
102
+ <https://trio.readthedocs.io>`__, which at the time didn't have any
103
+ HTTP libraries. So I thought, no big deal, Python has, like, a dozen
104
+ different implementations of HTTP, surely I can find one that's
105
+ reusable. I didn't find one, but I did find Cory's call-to-arms
106
+ blog-post. So I figured, well, fine, if I have to implement HTTP from
107
+ scratch, at least I can make sure no-one *else* has to ever again.
108
+
109
+ *Should I use it?*
110
+
111
+ Maybe. You should be aware that it's a very young project. But, it's
112
+ feature complete and has an exhaustive test-suite and complete docs,
113
+ so the next step is for people to try using it and see how it goes
114
+ :-). If you do then please let us know -- if nothing else we'll want
115
+ to talk to you before making any incompatible changes!
116
+
117
+ *What are the features/limitations?*
118
+
119
+ Roughly speaking, it's trying to be a robust, complete, and non-hacky
120
+ implementation of the first "chapter" of the HTTP/1.1 spec: `RFC 7230:
121
+ HTTP/1.1 Message Syntax and Routing
122
+ <https://tools.ietf.org/html/rfc7230>`_. That is, it mostly focuses on
123
+ implementing HTTP at the level of taking bytes on and off the wire,
124
+ and the headers related to that, and tries to be anal about spec
125
+ conformance. It doesn't know about higher-level concerns like URL
126
+ routing, conditional GETs, cross-origin cookie policies, or content
127
+ negotiation. But it does know how to take care of framing,
128
+ cross-version differences in keep-alive handling, and the "obsolete
129
+ line folding" rule, so you can focus your energies on the hard /
130
+ interesting parts for your application, and it tries to support the
131
+ full specification in the sense that any useful HTTP/1.1 conformant
132
+ application should be able to use h11.
133
+
134
+ It's pure Python, and has no dependencies outside of the standard
135
+ library.
136
+
137
+ It has a test suite with 100.0% coverage for both statements and
138
+ branches.
139
+
140
+ Currently it supports Python 3 (testing on 3.7-3.10) and PyPy 3.
141
+ The last Python 2-compatible version was h11 0.11.x.
142
+ (Originally it had a Cython wrapper for `http-parser
143
+ <https://github.com/nodejs/http-parser>`_ and a beautiful nested state
144
+ machine implemented with ``yield from`` to postprocess the output. But
145
+ I had to take these out -- the new *parser* needs fewer lines-of-code
146
+ than the old *parser wrapper*, is written in pure Python, uses no
147
+ exotic language syntax, and has more features. It's sad, really; that
148
+ old state machine was really slick. I just need a few sentences here
149
+ to mourn that.)
150
+
151
+ I don't know how fast it is. I haven't benchmarked or profiled it yet,
152
+ so it's probably got a few pointless hot spots, and I've been trying
153
+ to err on the side of simplicity and robustness instead of
154
+ micro-optimization. But at the architectural level I tried hard to
155
+ avoid fundamentally bad decisions, e.g., I believe that all the
156
+ parsing algorithms remain linear-time even in the face of pathological
157
+ input like slowloris, and there are no byte-by-byte loops. (I also
158
+ believe that it maintains bounded memory usage in the face of
159
+ arbitrary/pathological input.)
160
+
161
+ The whole library is ~800 lines-of-code. You can read and understand
162
+ the whole thing in less than an hour. Most of the energy invested in
163
+ this so far has been spent on trying to keep things simple by
164
+ minimizing special-cases and ad hoc state manipulation; even though it
165
+ is now quite small and simple, I'm still annoyed that I haven't
166
+ figured out how to make it even smaller and simpler. (Unfortunately,
167
+ HTTP does not lend itself to simplicity.)
168
+
169
+ The API is ~feature complete and I don't expect the general outlines
170
+ to change much, but you can't judge an API's ergonomics until you
171
+ actually document and use it, so I'd expect some changes in the
172
+ details.
173
+
174
+ *How do I try it?*
175
+
176
+ .. code-block:: sh
177
+
178
+ $ pip install h11
179
+ $ git clone git@github.com:python-hyper/h11
180
+ $ cd h11/examples
181
+ $ python basic-client.py
182
+
183
+ and go from there.
184
+
185
+ *License?*
186
+
187
+ MIT
188
+
189
+ *Code of conduct?*
190
+
191
+ Contributors are requested to follow our `code of conduct
192
+ <https://github.com/python-hyper/h11/blob/master/CODE_OF_CONDUCT.md>`_ in
193
+ all project spaces.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/RECORD ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ h11-0.14.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ h11-0.14.0.dist-info/LICENSE.txt,sha256=N9tbuFkm2yikJ6JYZ_ELEjIAOuob5pzLhRE4rbjm82E,1124
3
+ h11-0.14.0.dist-info/METADATA,sha256=B7pZ0m7WBXNs17vl6hUH9bJTL9s37DaGvY31w7jNxSg,8175
4
+ h11-0.14.0.dist-info/RECORD,,
5
+ h11-0.14.0.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
6
+ h11-0.14.0.dist-info/top_level.txt,sha256=F7dC4jl3zeh8TGHEPaWJrMbeuoWbS379Gwdi-Yvdcis,4
7
+ h11/__init__.py,sha256=iO1KzkSO42yZ6ffg-VMgbx_ZVTWGUY00nRYEWn-s3kY,1507
8
+ h11/__pycache__/__init__.cpython-312.pyc,,
9
+ h11/__pycache__/_abnf.cpython-312.pyc,,
10
+ h11/__pycache__/_connection.cpython-312.pyc,,
11
+ h11/__pycache__/_events.cpython-312.pyc,,
12
+ h11/__pycache__/_headers.cpython-312.pyc,,
13
+ h11/__pycache__/_readers.cpython-312.pyc,,
14
+ h11/__pycache__/_receivebuffer.cpython-312.pyc,,
15
+ h11/__pycache__/_state.cpython-312.pyc,,
16
+ h11/__pycache__/_util.cpython-312.pyc,,
17
+ h11/__pycache__/_version.cpython-312.pyc,,
18
+ h11/__pycache__/_writers.cpython-312.pyc,,
19
+ h11/_abnf.py,sha256=ybixr0xsupnkA6GFAyMubuXF6Tc1lb_hF890NgCsfNc,4815
20
+ h11/_connection.py,sha256=eS2sorMD0zKLCFiB9lW9W9F_Nzny2tjHa4e6s1ujr1c,26539
21
+ h11/_events.py,sha256=LEfuvg1AbhHaVRwxCd0I-pFn9-ezUOaoL8o2Kvy1PBA,11816
22
+ h11/_headers.py,sha256=RqB8cd8CN0blYPzcLe5qeCh-phv6D1U_CHj4hs67lgQ,10230
23
+ h11/_readers.py,sha256=EbSed0jzwVUiD1nOPAeUcVE4Flf3wXkxfb8c06-OTBM,8383
24
+ h11/_receivebuffer.py,sha256=xrspsdsNgWFxRfQcTXxR8RrdjRXXTK0Io5cQYWpJ1Ws,5252
25
+ h11/_state.py,sha256=k1VL6SDbaPkSrZ-49ewCXDpuiUS69_46YhbWjuV1qEY,13300
26
+ h11/_util.py,sha256=LWkkjXyJaFlAy6Lt39w73UStklFT5ovcvo0TkY7RYuk,4888
27
+ h11/_version.py,sha256=LVyTdiZRzIIEv79UyOgbM5iUrJUllEzlCWaJEYBY1zc,686
28
+ h11/_writers.py,sha256=oFKm6PtjeHfbj4RLX7VB7KDc1gIY53gXG3_HR9ltmTA,5081
29
+ h11/py.typed,sha256=sow9soTwP9T_gEAQSVh7Gb8855h04Nwmhs2We-JRgZM,7
30
+ h11/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ h11/tests/__pycache__/__init__.cpython-312.pyc,,
32
+ h11/tests/__pycache__/helpers.cpython-312.pyc,,
33
+ h11/tests/__pycache__/test_against_stdlib_http.cpython-312.pyc,,
34
+ h11/tests/__pycache__/test_connection.cpython-312.pyc,,
35
+ h11/tests/__pycache__/test_events.cpython-312.pyc,,
36
+ h11/tests/__pycache__/test_headers.cpython-312.pyc,,
37
+ h11/tests/__pycache__/test_helpers.cpython-312.pyc,,
38
+ h11/tests/__pycache__/test_io.cpython-312.pyc,,
39
+ h11/tests/__pycache__/test_receivebuffer.cpython-312.pyc,,
40
+ h11/tests/__pycache__/test_state.cpython-312.pyc,,
41
+ h11/tests/__pycache__/test_util.cpython-312.pyc,,
42
+ h11/tests/data/test-file,sha256=ZJ03Rqs98oJw29OHzJg7LlMzyGQaRAY0r3AqBeM2wVU,65
43
+ h11/tests/helpers.py,sha256=a1EVG_p7xU4wRsa3tMPTRxuaKCmretok9sxXWvqfmQA,3355
44
+ h11/tests/test_against_stdlib_http.py,sha256=cojCHgHXFQ8gWhNlEEwl3trmOpN-5uDukRoHnElqo3A,3995
45
+ h11/tests/test_connection.py,sha256=ZbPLDPclKvjgjAhgk-WlCPBaf17c4XUIV2tpaW08jOI,38720
46
+ h11/tests/test_events.py,sha256=LPVLbcV-NvPNK9fW3rraR6Bdpz1hAlsWubMtNaJ5gHg,4657
47
+ h11/tests/test_headers.py,sha256=qd8T1Zenuz5GbD6wklSJ5G8VS7trrYgMV0jT-SMvqg8,5612
48
+ h11/tests/test_helpers.py,sha256=kAo0CEM4LGqmyyP2ZFmhsyq3UFJqoFfAbzu3hbWreRM,794
49
+ h11/tests/test_io.py,sha256=uCZVnjarkRBkudfC1ij-KSCQ71XWJhnkgkgWWkKgYPQ,16386
50
+ h11/tests/test_receivebuffer.py,sha256=3jGbeJM36Akqg_pAhPb7XzIn2NS6RhPg-Ryg8Eu6ytk,3454
51
+ h11/tests/test_state.py,sha256=rqll9WqFsJPE0zSrtCn9LH659mPKsDeXZ-DwXwleuBQ,8928
52
+ h11/tests/test_util.py,sha256=VO5L4nSFe4pgtSwKuv6u_6l0H7UeizF5WKuHTWreg70,2970
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.37.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/h11-0.14.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ h11
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/INSTALLER ADDED
@@ -0,0 +1 @@
 
 
1
+ pip
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/LICENSE ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright 2016 Andrew Svetlov and aio-libs contributors
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: multidict
3
+ Version: 6.1.0
4
+ Summary: multidict implementation
5
+ Home-page: https://github.com/aio-libs/multidict
6
+ Author: Andrew Svetlov
7
+ Author-email: andrew.svetlov@gmail.com
8
+ License: Apache 2
9
+ Project-URL: Chat: Matrix, https://matrix.to/#/#aio-libs:matrix.org
10
+ Project-URL: Chat: Matrix Space, https://matrix.to/#/#aio-libs-space:matrix.org
11
+ Project-URL: CI: GitHub, https://github.com/aio-libs/multidict/actions
12
+ Project-URL: Code of Conduct, https://github.com/aio-libs/.github/blob/master/CODE_OF_CONDUCT.md
13
+ Project-URL: Coverage: codecov, https://codecov.io/github/aio-libs/multidict
14
+ Project-URL: Docs: Changelog, https://multidict.aio-libs.org/en/latest/changes/
15
+ Project-URL: Docs: RTD, https://multidict.aio-libs.org
16
+ Project-URL: GitHub: issues, https://github.com/aio-libs/multidict/issues
17
+ Project-URL: GitHub: repo, https://github.com/aio-libs/multidict
18
+ Classifier: Development Status :: 5 - Production/Stable
19
+ Classifier: Intended Audience :: Developers
20
+ Classifier: License :: OSI Approved :: Apache Software License
21
+ Classifier: Programming Language :: Python
22
+ Classifier: Programming Language :: Python :: 3
23
+ Classifier: Programming Language :: Python :: 3.8
24
+ Classifier: Programming Language :: Python :: 3.9
25
+ Classifier: Programming Language :: Python :: 3.10
26
+ Classifier: Programming Language :: Python :: 3.11
27
+ Classifier: Programming Language :: Python :: 3.12
28
+ Classifier: Programming Language :: Python :: 3.13
29
+ Requires-Python: >=3.8
30
+ Description-Content-Type: text/x-rst
31
+ License-File: LICENSE
32
+ Requires-Dist: typing-extensions >=4.1.0 ; python_version < "3.11"
33
+
34
+ =========
35
+ multidict
36
+ =========
37
+
38
+ .. image:: https://github.com/aio-libs/multidict/actions/workflows/ci-cd.yml/badge.svg
39
+ :target: https://github.com/aio-libs/multidict/actions
40
+ :alt: GitHub status for master branch
41
+
42
+ .. image:: https://codecov.io/gh/aio-libs/multidict/branch/master/graph/badge.svg
43
+ :target: https://codecov.io/gh/aio-libs/multidict
44
+ :alt: Coverage metrics
45
+
46
+ .. image:: https://img.shields.io/pypi/v/multidict.svg
47
+ :target: https://pypi.org/project/multidict
48
+ :alt: PyPI
49
+
50
+ .. image:: https://readthedocs.org/projects/multidict/badge/?version=latest
51
+ :target: https://multidict.aio-libs.org
52
+ :alt: Read The Docs build status badge
53
+
54
+ .. image:: https://img.shields.io/pypi/pyversions/multidict.svg
55
+ :target: https://pypi.org/project/multidict
56
+ :alt: Python versions
57
+
58
+ .. image:: https://img.shields.io/matrix/aio-libs:matrix.org?label=Discuss%20on%20Matrix%20at%20%23aio-libs%3Amatrix.org&logo=matrix&server_fqdn=matrix.org&style=flat
59
+ :target: https://matrix.to/#/%23aio-libs:matrix.org
60
+ :alt: Matrix Room — #aio-libs:matrix.org
61
+
62
+ .. image:: https://img.shields.io/matrix/aio-libs-space:matrix.org?label=Discuss%20on%20Matrix%20at%20%23aio-libs-space%3Amatrix.org&logo=matrix&server_fqdn=matrix.org&style=flat
63
+ :target: https://matrix.to/#/%23aio-libs-space:matrix.org
64
+ :alt: Matrix Space — #aio-libs-space:matrix.org
65
+
66
+ Multidict is dict-like collection of *key-value pairs* where key
67
+ might occur more than once in the container.
68
+
69
+ Introduction
70
+ ------------
71
+
72
+ *HTTP Headers* and *URL query string* require specific data structure:
73
+ *multidict*. It behaves mostly like a regular ``dict`` but it may have
74
+ several *values* for the same *key* and *preserves insertion ordering*.
75
+
76
+ The *key* is ``str`` (or ``istr`` for case-insensitive dictionaries).
77
+
78
+ ``multidict`` has four multidict classes:
79
+ ``MultiDict``, ``MultiDictProxy``, ``CIMultiDict``
80
+ and ``CIMultiDictProxy``.
81
+
82
+ Immutable proxies (``MultiDictProxy`` and
83
+ ``CIMultiDictProxy``) provide a dynamic view for the
84
+ proxied multidict, the view reflects underlying collection changes. They
85
+ implement the ``collections.abc.Mapping`` interface.
86
+
87
+ Regular mutable (``MultiDict`` and ``CIMultiDict``) classes
88
+ implement ``collections.abc.MutableMapping`` and allows them to change
89
+ their own content.
90
+
91
+
92
+ *Case insensitive* (``CIMultiDict`` and
93
+ ``CIMultiDictProxy``) assume the *keys* are case
94
+ insensitive, e.g.::
95
+
96
+ >>> dct = CIMultiDict(key='val')
97
+ >>> 'Key' in dct
98
+ True
99
+ >>> dct['Key']
100
+ 'val'
101
+
102
+ *Keys* should be ``str`` or ``istr`` instances.
103
+
104
+ The library has optional C Extensions for speed.
105
+
106
+
107
+ License
108
+ -------
109
+
110
+ Apache 2
111
+
112
+ Library Installation
113
+ --------------------
114
+
115
+ .. code-block:: bash
116
+
117
+ $ pip install multidict
118
+
119
+ The library is Python 3 only!
120
+
121
+ PyPI contains binary wheels for Linux, Windows and MacOS. If you want to install
122
+ ``multidict`` on another operating system (or *Alpine Linux* inside a Docker) the
123
+ tarball will be used to compile the library from source. It requires a C compiler and
124
+ Python headers to be installed.
125
+
126
+ To skip the compilation, please use the `MULTIDICT_NO_EXTENSIONS` environment variable,
127
+ e.g.:
128
+
129
+ .. code-block:: bash
130
+
131
+ $ MULTIDICT_NO_EXTENSIONS=1 pip install multidict
132
+
133
+ Please note, the pure Python (uncompiled) version is about 20-50 times slower depending on
134
+ the usage scenario!!!
135
+
136
+
137
+
138
+ Changelog
139
+ ---------
140
+ See `RTD page <http://multidict.aio-libs.org/en/latest/changes>`_.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ multidict-6.1.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
2
+ multidict-6.1.0.dist-info/LICENSE,sha256=k9Ealo4vDzY3PECBH_bSDhc_WMPKtYhM1mF7v9eVSSo,611
3
+ multidict-6.1.0.dist-info/METADATA,sha256=OnCx5DR4XPf64GIDK4XmcA2e7HLQ_784vMfEQy287kM,4979
4
+ multidict-6.1.0.dist-info/RECORD,,
5
+ multidict-6.1.0.dist-info/WHEEL,sha256=3FRagTIevYnyede1Gym_XNKguJrd07UOyEdLNhxNq20,151
6
+ multidict-6.1.0.dist-info/top_level.txt,sha256=-euDElkk5_qkmfIJ7WiqCab02ZlSFZWynejKg59qZQQ,10
7
+ multidict/__init__.py,sha256=p60Ag5UVACSli1txazSi85foCmHN-cg3qZDCuWdOKng,928
8
+ multidict/__init__.pyi,sha256=SbgC2ew1NvNXWlRKs9o0KhW4moozgMqgQ0OA4Re5JQQ,4840
9
+ multidict/__pycache__/__init__.cpython-312.pyc,,
10
+ multidict/__pycache__/_abc.cpython-312.pyc,,
11
+ multidict/__pycache__/_compat.cpython-312.pyc,,
12
+ multidict/__pycache__/_multidict_base.cpython-312.pyc,,
13
+ multidict/__pycache__/_multidict_py.cpython-312.pyc,,
14
+ multidict/_abc.py,sha256=Zvnrn4SBkrv4QTD7-ZzqNcoxw0f8KStLMPzGvBuGT2w,1190
15
+ multidict/_compat.py,sha256=uCNUpVHJSFOiKUJmRcz3SDqMpkb37C_csc29ijr8Evo,352
16
+ multidict/_multidict.cpython-312-x86_64-linux-gnu.so,sha256=6BwP62oLns2chEgPfwAa8DseIoF0wOWBe81pHjnlqhs,418968
17
+ multidict/_multidict_base.py,sha256=ZndtnZ5oc1sODKmXsv6F9kWvVNCda9xAEEFXkaPoFoA,3979
18
+ multidict/_multidict_py.py,sha256=57h4sYrRIu7EjMX4YpHVIZVrV9-q1KCW3F6rao10D3U,15050
19
+ multidict/py.typed,sha256=e9bmbH3UFxsabQrnNFPG9qxIXztwbcM6IKDYnvZwprY,15
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/multidict-6.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (74.1.2)
3
+ Root-Is-Purelib: false
4
+ Tag: cp312-cp312-manylinux_2_17_x86_64
5
+ Tag: cp312-cp312-manylinux2014_x86_64
6
+