Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- SpecForge-ext/benchmarks/README.md +67 -0
- SpecForge-ext/benchmarks/__init__.py +3 -0
- SpecForge-ext/benchmarks/bench_eagle3.py +268 -0
- SpecForge-ext/benchmarks/benchmarker/__init__.py +29 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/__init__.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/__init__.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/__init__.cpython-312.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/aime.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/aime.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/aime.cpython-312.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/base.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/base.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/base.cpython-312.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/ceval.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/ceval.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/financeqa.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/financeqa.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/gpqa.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/gpqa.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/gsm8k.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/gsm8k.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/humaneval.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/humaneval.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/livecodebench.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/math500.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/math500.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/mmlu.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/mmlu.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/mmstar.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/mmstar.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/mtbench.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/mtbench.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/registry.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/registry.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/simpleqa.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/simpleqa.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/utils.cpython-310.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/__pycache__/utils.cpython-311.pyc +0 -0
- SpecForge-ext/benchmarks/benchmarker/aime.py +133 -0
- SpecForge-ext/benchmarks/benchmarker/base.py +218 -0
- SpecForge-ext/benchmarks/benchmarker/ceval.py +267 -0
- SpecForge-ext/benchmarks/benchmarker/financeqa.py +59 -0
- SpecForge-ext/benchmarks/benchmarker/gpqa.py +85 -0
- SpecForge-ext/benchmarks/benchmarker/gsm8k.py +108 -0
- SpecForge-ext/benchmarks/benchmarker/humaneval.py +201 -0
- SpecForge-ext/benchmarks/benchmarker/livecodebench.py +46 -0
- SpecForge-ext/benchmarks/benchmarker/math500.py +122 -0
- SpecForge-ext/benchmarks/benchmarker/mmlu.py +82 -0
- SpecForge-ext/benchmarks/benchmarker/mmstar.py +185 -0
- SpecForge-ext/benchmarks/benchmarker/mtbench.py +70 -0
SpecForge-ext/benchmarks/README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Benchmarking for Speculative Decoding
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
We provided a unified script to test the performance of the Speculative Decoding with EAGLE3 algorithm on multiple datasets. You can follow the steps below to run the benchmarks.
|
| 6 |
+
|
| 7 |
+
## Run Benchmarks
|
| 8 |
+
|
| 9 |
+
### Launch SGLang and Benchmarker Concurrently
|
| 10 |
+
|
| 11 |
+
`bench_eagle3.py` can help you launch a SGLang server process and a Benchmarking process concurrently. In this way, you don't have to launch the SGLang server manually, this script will manually handle the SGLang launch under different speculative decoding configurations. Some important arguments are:
|
| 12 |
+
- `--model-path`: the path to the target model.
|
| 13 |
+
- `--speculative-draft-model-path`: the path to the draft model.
|
| 14 |
+
- `--port`: the port to launch the SGLang server.
|
| 15 |
+
- `--trust-remote-code`: trust the remote code.
|
| 16 |
+
- `--mem-fraction-static`: the memory fraction for the static memory.
|
| 17 |
+
- `--tp-size`: the tensor parallelism size.
|
| 18 |
+
- `--attention-backend`: the attention backend.
|
| 19 |
+
- `--config-list`: the list of speculative decoding configuration to test, the format is `<batch-size>,<num-steps>,<topk>,<num-draft-tokens>`.
|
| 20 |
+
- `--benchmark-list`: the list of benchmarks to test, the format is `<benchmark-name>:<num-prompts>:<subset>`.
|
| 21 |
+
|
| 22 |
+
```shell
|
| 23 |
+
python3 bench_eagle3.py \
|
| 24 |
+
--model-path meta-llama/Llama-3.1-8B-Instruct \
|
| 25 |
+
--speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
|
| 26 |
+
--port 30000 \
|
| 27 |
+
--trust-remote-code \
|
| 28 |
+
--mem-fraction-static 0.8 \
|
| 29 |
+
--tp-size 1 \
|
| 30 |
+
--attention-backend fa3 \
|
| 31 |
+
--config-list 1,0,0,0 1,3,1,4 \
|
| 32 |
+
--benchmark-list mtbench gsm8k:5 ceval:5:accountant \
|
| 33 |
+
--dtype bfloat16
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
### Launch Benchmarker Independently
|
| 37 |
+
|
| 38 |
+
If you want to launch the SGLang server independently, you can use the following command.
|
| 39 |
+
|
| 40 |
+
```shell
|
| 41 |
+
# you can launch a server
|
| 42 |
+
python3 -m sglang.launch_server \
|
| 43 |
+
--model meta-llama/Llama-3.1-8B-Instruct \
|
| 44 |
+
--speculative-algorithm EAGLE3 \
|
| 45 |
+
--speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
|
| 46 |
+
--speculative-num-steps 3 \
|
| 47 |
+
--speculative-eagle-topk 1 \
|
| 48 |
+
--speculative-num-draft-tokens 4 \
|
| 49 |
+
--mem-fraction-static 0.75 \
|
| 50 |
+
--cuda-graph-max-bs 1 \
|
| 51 |
+
--tp 1 \
|
| 52 |
+
--trust-remote-code \
|
| 53 |
+
--host 0.0.0.0 \
|
| 54 |
+
--port 30000 \
|
| 55 |
+
--dtype bfloat16
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
Then we can start benchmarking. Note that you should use the same host and port as the one used in the SGLang server. Note that `--skip-launch-server` is required to skip the launch of the SGLang server.
|
| 59 |
+
|
| 60 |
+
```bash
|
| 61 |
+
python bench_eagle3.py \
|
| 62 |
+
--model-path meta-llama/Llama-3.1-8B-Instruct \
|
| 63 |
+
--port 30000 \
|
| 64 |
+
--config-list 1,3,1,4 \
|
| 65 |
+
--benchmark-list mtbench:5 ceval:5:accountant gsm8k:5 humaneval:5 math500:5 mtbench:5 aime:1 \
|
| 66 |
+
--skip-launch-server
|
| 67 |
+
```
|
SpecForge-ext/benchmarks/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Benchmark scripts for speculative decoding evaluation.
|
| 3 |
+
"""
|
SpecForge-ext/benchmarks/bench_eagle3.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Usage:
|
| 4 |
+
|
| 5 |
+
# if you want to run benchmarks directly
|
| 6 |
+
# mtbench:20 means only run 20 samples in the dataset
|
| 7 |
+
python bench_eagle3.py \
|
| 8 |
+
--model meta-llama/Llama-3.1-8B-Instruct \
|
| 9 |
+
--speculative-algorithm EAGLE3 \
|
| 10 |
+
--speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
|
| 11 |
+
--port 30000 \
|
| 12 |
+
--config-list 1,0,0,0 1,3,1,4 \
|
| 13 |
+
--benchmark-list mtbench:20 \
|
| 14 |
+
--dtype bfloat16
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
or if you want run sglang alone.
|
| 18 |
+
|
| 19 |
+
# launch sglang
|
| 20 |
+
python3 -m sglang.launch_server \
|
| 21 |
+
--model meta-llama/Llama-3.1-8B-Instruct \
|
| 22 |
+
--speculative-algorithm EAGLE3 \
|
| 23 |
+
--speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
|
| 24 |
+
--speculative-num-steps 3 \
|
| 25 |
+
--speculative-eagle-topk 1 \
|
| 26 |
+
--speculative-num-draft-tokens 4 \
|
| 27 |
+
--mem-fraction-static 0.75 \
|
| 28 |
+
--cuda-graph-max-bs 1 \
|
| 29 |
+
--tp 1 \
|
| 30 |
+
--trust-remote-code \
|
| 31 |
+
--host 0.0.0.0 \
|
| 32 |
+
--port 30000 \
|
| 33 |
+
--dtype bfloat16
|
| 34 |
+
|
| 35 |
+
# then run benchmarks
|
| 36 |
+
python bench_eagle3.py \
|
| 37 |
+
--model-path meta-llama/Llama-3.1-8B-Instruct \
|
| 38 |
+
--port 30000 \
|
| 39 |
+
--config-list 1,0,0,0 \
|
| 40 |
+
--benchmark-list mtbench:80 \
|
| 41 |
+
--dtype bfloat16 \
|
| 42 |
+
--skip-launch-server
|
| 43 |
+
"""
|
| 44 |
+
import argparse
|
| 45 |
+
import json
|
| 46 |
+
import os
|
| 47 |
+
import time
|
| 48 |
+
from dataclasses import asdict
|
| 49 |
+
from typing import List
|
| 50 |
+
|
| 51 |
+
import requests
|
| 52 |
+
from benchmarker import BENCHMARKS
|
| 53 |
+
from sglang.srt.server_args import ServerArgs
|
| 54 |
+
from sglang.test.test_utils import kill_process_tree, popen_launch_server
|
| 55 |
+
from sglang.utils import wait_for_server
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def parse_args():
|
| 59 |
+
parser = argparse.ArgumentParser()
|
| 60 |
+
sglang_group = parser.add_argument_group("sglang")
|
| 61 |
+
ServerArgs.add_cli_args(sglang_group)
|
| 62 |
+
|
| 63 |
+
# make the follow args a group
|
| 64 |
+
benchmark_group = parser.add_argument_group("benchmark")
|
| 65 |
+
benchmark_group.add_argument(
|
| 66 |
+
"--skip-launch-server", action="store_true", default=False
|
| 67 |
+
)
|
| 68 |
+
benchmark_group.add_argument("--timeout-for-server-launch", type=int, default=600)
|
| 69 |
+
benchmark_group.add_argument("--num-prompts", type=int, default=80)
|
| 70 |
+
benchmark_group.add_argument("--output-dir", type=str, default="./results")
|
| 71 |
+
benchmark_group.add_argument(
|
| 72 |
+
"--config-list", type=str, nargs="+", default=["1,0,0,0", "1,3,1,4"]
|
| 73 |
+
)
|
| 74 |
+
benchmark_group.add_argument(
|
| 75 |
+
"--name",
|
| 76 |
+
type=str,
|
| 77 |
+
default=None,
|
| 78 |
+
help="name of this benchmark run, if provided, will be added to the output file name",
|
| 79 |
+
)
|
| 80 |
+
benchmark_group.add_argument(
|
| 81 |
+
"--benchmark-list",
|
| 82 |
+
type=str,
|
| 83 |
+
nargs="+",
|
| 84 |
+
default=[
|
| 85 |
+
"mtbench:80",
|
| 86 |
+
"gsm8k:200",
|
| 87 |
+
"humaneval:200",
|
| 88 |
+
"math500:200",
|
| 89 |
+
"ceval:200",
|
| 90 |
+
],
|
| 91 |
+
help=f"The list of benchmarks to run. The format is <benchmark-name>:<num-prompts>:<subset>,<subset>. We support the following benchmarks: {', '.join(BENCHMARKS.benchmarks.keys())}",
|
| 92 |
+
)
|
| 93 |
+
benchmark_group.add_argument(
|
| 94 |
+
"--enable-multi-turn-conversation",
|
| 95 |
+
action="store_true",
|
| 96 |
+
default=False,
|
| 97 |
+
)
|
| 98 |
+
return parser.parse_args()
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def launch_sglang_server(
|
| 102 |
+
server_args: ServerArgs,
|
| 103 |
+
base_url: str,
|
| 104 |
+
batch_size: int,
|
| 105 |
+
steps: int,
|
| 106 |
+
topk: int,
|
| 107 |
+
num_draft_tokens: int,
|
| 108 |
+
timeout: int,
|
| 109 |
+
):
|
| 110 |
+
"""
|
| 111 |
+
This function launches the SGLang server with the given server arguments.
|
| 112 |
+
"""
|
| 113 |
+
sglang_args: List[str] = []
|
| 114 |
+
if steps > 0:
|
| 115 |
+
sglang_args.extend(
|
| 116 |
+
[
|
| 117 |
+
"--speculative-algorithm",
|
| 118 |
+
"EAGLE3",
|
| 119 |
+
"--speculative-num-steps",
|
| 120 |
+
str(steps),
|
| 121 |
+
"--speculative-eagle-topk",
|
| 122 |
+
str(topk),
|
| 123 |
+
"--speculative-num-draft-tokens",
|
| 124 |
+
str(num_draft_tokens),
|
| 125 |
+
"--speculative-draft-model-path",
|
| 126 |
+
server_args.speculative_draft_model_path,
|
| 127 |
+
]
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
sglang_args.extend(
|
| 131 |
+
[
|
| 132 |
+
"--cuda-graph-max-bs",
|
| 133 |
+
str(batch_size),
|
| 134 |
+
"--mem-fraction-static",
|
| 135 |
+
str(server_args.mem_fraction_static),
|
| 136 |
+
"--tp-size",
|
| 137 |
+
str(server_args.tp_size),
|
| 138 |
+
"--max-running-requests",
|
| 139 |
+
str(batch_size),
|
| 140 |
+
]
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
if server_args.trust_remote_code:
|
| 144 |
+
sglang_args.extend(["--trust-remote-code"])
|
| 145 |
+
|
| 146 |
+
if server_args.disable_radix_cache:
|
| 147 |
+
sglang_args.extend(["--disable-radix-cache"])
|
| 148 |
+
|
| 149 |
+
if server_args.ep_size:
|
| 150 |
+
sglang_args.extend(["--ep-size", str(server_args.ep_size)])
|
| 151 |
+
|
| 152 |
+
if server_args.attention_backend:
|
| 153 |
+
sglang_args.extend(["--attention-backend", server_args.attention_backend])
|
| 154 |
+
|
| 155 |
+
if server_args.quantization:
|
| 156 |
+
sglang_args.extend(["--quantization", server_args.quantization])
|
| 157 |
+
|
| 158 |
+
if server_args.dtype:
|
| 159 |
+
sglang_args.extend(["--dtype", server_args.dtype])
|
| 160 |
+
|
| 161 |
+
process = popen_launch_server(
|
| 162 |
+
server_args.model_path,
|
| 163 |
+
base_url,
|
| 164 |
+
timeout=timeout,
|
| 165 |
+
other_args=sglang_args,
|
| 166 |
+
env={
|
| 167 |
+
"SGLANG_RECORD_STEP_TIME": "1",
|
| 168 |
+
"SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN": "1",
|
| 169 |
+
**os.environ,
|
| 170 |
+
},
|
| 171 |
+
)
|
| 172 |
+
return process
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def send_flush_cache_request(base_url: str):
|
| 176 |
+
requests.post(base_url + "/flush_cache")
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def main():
|
| 180 |
+
args = parse_args()
|
| 181 |
+
server_args: ServerArgs = ServerArgs.from_cli_args(args)
|
| 182 |
+
configs = [tuple(map(int, config.split(","))) for config in args.config_list]
|
| 183 |
+
|
| 184 |
+
# split the arg into list of (bench_name, num_prompts)
|
| 185 |
+
benchmark_list = []
|
| 186 |
+
for item in args.benchmark_list:
|
| 187 |
+
splits = item.split(":")
|
| 188 |
+
if len(splits) == 1:
|
| 189 |
+
bench_name = splits[0]
|
| 190 |
+
num_prompts = None
|
| 191 |
+
subset = None
|
| 192 |
+
elif len(splits) == 2:
|
| 193 |
+
bench_name, num_prompts = splits
|
| 194 |
+
subset = None
|
| 195 |
+
elif len(splits) == 3:
|
| 196 |
+
bench_name, num_prompts, subset = splits
|
| 197 |
+
subset = subset.split(",")
|
| 198 |
+
else:
|
| 199 |
+
raise ValueError(f"Invalid benchmark list format: {item}")
|
| 200 |
+
benchmark_list.append((bench_name, num_prompts, subset))
|
| 201 |
+
assert len(benchmark_list) != 0, "the number of benchmark list is 0"
|
| 202 |
+
|
| 203 |
+
base_url = f"http://localhost:{args.port}"
|
| 204 |
+
|
| 205 |
+
results = {}
|
| 206 |
+
results["model"] = server_args.speculative_draft_model_path
|
| 207 |
+
|
| 208 |
+
def run_benchmarks(batch_size: int, steps: int, topk: int, num_draft_tokens: int):
|
| 209 |
+
for benchmark_name, num_prompts, subset in benchmark_list:
|
| 210 |
+
print(
|
| 211 |
+
f"Running benchmark {benchmark_name} with {num_prompts} prompts, batch size {batch_size}, steps {steps}, topk {topk}, num_draft_tokens {num_draft_tokens}, subset {subset}"
|
| 212 |
+
)
|
| 213 |
+
benchmarkder_cls = BENCHMARKS.get(benchmark_name)
|
| 214 |
+
num_prompts = int(num_prompts) if num_prompts is not None else None
|
| 215 |
+
if subset is None:
|
| 216 |
+
benchmarker = benchmarkder_cls(num_samples=num_prompts)
|
| 217 |
+
else:
|
| 218 |
+
benchmarker = benchmarkder_cls(num_samples=num_prompts, subset=subset)
|
| 219 |
+
metrics_list = benchmarker.run(
|
| 220 |
+
host=args.host, port=args.port, batch_size=batch_size
|
| 221 |
+
)
|
| 222 |
+
send_flush_cache_request(base_url)
|
| 223 |
+
if benchmark_name not in results:
|
| 224 |
+
results[benchmark_name] = []
|
| 225 |
+
results[benchmark_name].append(
|
| 226 |
+
dict(
|
| 227 |
+
batch_size=batch_size,
|
| 228 |
+
steps=steps,
|
| 229 |
+
topk=topk,
|
| 230 |
+
num_draft_tokens=num_draft_tokens,
|
| 231 |
+
metrics=[asdict(metric) for metric in metrics_list],
|
| 232 |
+
num_samples=num_prompts,
|
| 233 |
+
)
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
if args.skip_launch_server:
|
| 237 |
+
batch_size = configs[0][0] if len(configs) > 0 else 8
|
| 238 |
+
run_benchmarks(batch_size, None, None, None)
|
| 239 |
+
else:
|
| 240 |
+
# we itearate over each config from args
|
| 241 |
+
for batch_size, steps, topk, num_draft_tokens in configs:
|
| 242 |
+
process = launch_sglang_server(
|
| 243 |
+
server_args,
|
| 244 |
+
base_url,
|
| 245 |
+
batch_size,
|
| 246 |
+
steps,
|
| 247 |
+
topk,
|
| 248 |
+
num_draft_tokens,
|
| 249 |
+
args.timeout_for_server_launch,
|
| 250 |
+
)
|
| 251 |
+
wait_for_server(base_url)
|
| 252 |
+
run_benchmarks(batch_size, steps, topk, num_draft_tokens)
|
| 253 |
+
kill_process_tree(process.pid)
|
| 254 |
+
process.wait()
|
| 255 |
+
|
| 256 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 257 |
+
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
| 258 |
+
result_file = os.path.join(
|
| 259 |
+
args.output_dir,
|
| 260 |
+
f"{args.name + '_' if args.name else ''}results_{timestamp}.jsonl",
|
| 261 |
+
)
|
| 262 |
+
with open(result_file, "w") as f:
|
| 263 |
+
json.dump(results, f, indent=4)
|
| 264 |
+
print(f"Results saved to {result_file}")
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
if __name__ == "__main__":
|
| 268 |
+
main()
|
SpecForge-ext/benchmarks/benchmarker/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .aime import AIMEBenchmarker
|
| 2 |
+
from .ceval import CEvalBenchmarker
|
| 3 |
+
from .financeqa import FinanceQABenchmarker
|
| 4 |
+
from .gpqa import GPQABenchmarker
|
| 5 |
+
from .gsm8k import GSM8KBenchmarker
|
| 6 |
+
from .humaneval import HumanEvalBenchmarker
|
| 7 |
+
from .livecodebench import LCBBenchmarker
|
| 8 |
+
from .math500 import Math500Benchmarker
|
| 9 |
+
from .mmlu import MMLUBenchmarker
|
| 10 |
+
from .mmstar import MMStarBenchmarker
|
| 11 |
+
from .mtbench import MTBenchBenchmarker
|
| 12 |
+
from .registry import BENCHMARKS
|
| 13 |
+
from .simpleqa import SimpleQABenchmarker
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
"BENCHMARKS",
|
| 17 |
+
"AIMEBenchmarker",
|
| 18 |
+
"CEvalBenchmarker",
|
| 19 |
+
"GSM8KBenchmarker",
|
| 20 |
+
"HumanEvalBenchmarker",
|
| 21 |
+
"Math500Benchmarker",
|
| 22 |
+
"MTBenchBenchmarker",
|
| 23 |
+
"MMStarBenchmarker",
|
| 24 |
+
"GPQABenchmarker",
|
| 25 |
+
"FinanceQABenchmarker",
|
| 26 |
+
"MMLUBenchmarker",
|
| 27 |
+
"LCBBenchmarker",
|
| 28 |
+
"SimpleQABenchmarker",
|
| 29 |
+
]
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (872 Bytes). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (1.11 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (896 Bytes). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/aime.cpython-310.pyc
ADDED
|
Binary file (4.16 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/aime.cpython-311.pyc
ADDED
|
Binary file (6.79 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/aime.cpython-312.pyc
ADDED
|
Binary file (5.8 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/base.cpython-310.pyc
ADDED
|
Binary file (6.47 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/base.cpython-311.pyc
ADDED
|
Binary file (9.05 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/base.cpython-312.pyc
ADDED
|
Binary file (8.11 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/ceval.cpython-310.pyc
ADDED
|
Binary file (6.57 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/ceval.cpython-311.pyc
ADDED
|
Binary file (11.7 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/financeqa.cpython-310.pyc
ADDED
|
Binary file (2.07 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/financeqa.cpython-311.pyc
ADDED
|
Binary file (3.34 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/gpqa.cpython-310.pyc
ADDED
|
Binary file (3.24 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/gpqa.cpython-311.pyc
ADDED
|
Binary file (5.36 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/gsm8k.cpython-310.pyc
ADDED
|
Binary file (3.94 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/gsm8k.cpython-311.pyc
ADDED
|
Binary file (6.72 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/humaneval.cpython-310.pyc
ADDED
|
Binary file (4.88 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/humaneval.cpython-311.pyc
ADDED
|
Binary file (8.95 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/livecodebench.cpython-310.pyc
ADDED
|
Binary file (1.87 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/math500.cpython-310.pyc
ADDED
|
Binary file (3.73 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/math500.cpython-311.pyc
ADDED
|
Binary file (6.26 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/mmlu.cpython-310.pyc
ADDED
|
Binary file (3.11 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/mmlu.cpython-311.pyc
ADDED
|
Binary file (5.19 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/mmstar.cpython-310.pyc
ADDED
|
Binary file (5.08 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/mmstar.cpython-311.pyc
ADDED
|
Binary file (10.1 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/mtbench.cpython-310.pyc
ADDED
|
Binary file (3.07 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/mtbench.cpython-311.pyc
ADDED
|
Binary file (4.63 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/registry.cpython-310.pyc
ADDED
|
Binary file (1.22 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/registry.cpython-311.pyc
ADDED
|
Binary file (1.51 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/simpleqa.cpython-310.pyc
ADDED
|
Binary file (1.81 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/simpleqa.cpython-311.pyc
ADDED
|
Binary file (2.83 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (8.59 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (13.7 kB). View file
|
|
|
SpecForge-ext/benchmarks/benchmarker/aime.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AIME benchmark
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 7 |
+
|
| 8 |
+
from datasets import load_dataset
|
| 9 |
+
|
| 10 |
+
from .base import Benchmarker
|
| 11 |
+
from .registry import BENCHMARKS
|
| 12 |
+
from .utils import create_simple_sgl_function
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def extract_aime_answer(output: str) -> Optional[str]:
|
| 16 |
+
"""Extract final answer from AIME problem solution.
|
| 17 |
+
|
| 18 |
+
AIME answers are typically integers between 0 and 999, and are usually
|
| 19 |
+
in \boxed{} format.
|
| 20 |
+
"""
|
| 21 |
+
# Try to find answer in \boxed{} format
|
| 22 |
+
boxed_pattern = r"\\boxed\{([^}]+)\}"
|
| 23 |
+
match = re.search(boxed_pattern, output)
|
| 24 |
+
if match:
|
| 25 |
+
answer = match.group(1).strip()
|
| 26 |
+
# Extract number from the boxed content
|
| 27 |
+
numbers = re.findall(r"\d+", answer)
|
| 28 |
+
if numbers:
|
| 29 |
+
return numbers[-1] # Take the last number (usually the final answer)
|
| 30 |
+
return answer
|
| 31 |
+
|
| 32 |
+
# Try to find answer in \boxed format (without braces)
|
| 33 |
+
boxed_pattern2 = r"\\boxed\s+(\d+)"
|
| 34 |
+
match = re.search(boxed_pattern2, output)
|
| 35 |
+
if match:
|
| 36 |
+
return match.group(1).strip()
|
| 37 |
+
|
| 38 |
+
# Look for patterns like "The answer is 42" or "Answer: 123"
|
| 39 |
+
answer_patterns = [
|
| 40 |
+
r"(?:answer|Answer|ANSWER)[\s:]+(\d+)",
|
| 41 |
+
r"(?:final\s+answer|Final\s+Answer)[\s:]+(\d+)",
|
| 42 |
+
r"(?:is|equals?|=\s*)(\d+)\s*$",
|
| 43 |
+
]
|
| 44 |
+
for pattern in answer_patterns:
|
| 45 |
+
matches = re.findall(pattern, output, re.IGNORECASE)
|
| 46 |
+
if matches:
|
| 47 |
+
return matches[-1].strip()
|
| 48 |
+
|
| 49 |
+
# Fallback: extract the last integer in the text
|
| 50 |
+
numbers = re.findall(r"\b(\d+)\b", output)
|
| 51 |
+
if numbers:
|
| 52 |
+
# Filter to reasonable AIME answer range (0-999)
|
| 53 |
+
valid_numbers = [n for n in numbers if 0 <= int(n) <= 999]
|
| 54 |
+
if valid_numbers:
|
| 55 |
+
return valid_numbers[-1]
|
| 56 |
+
|
| 57 |
+
return None
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@BENCHMARKS.register("aime")
|
| 61 |
+
class AIMEBenchmarker(Benchmarker):
|
| 62 |
+
"""AIME benchmark implementation."""
|
| 63 |
+
|
| 64 |
+
def __init__(self, num_samples: Optional[int] = None):
|
| 65 |
+
super().__init__(num_samples, None)
|
| 66 |
+
|
| 67 |
+
def load_data(self) -> Tuple[List[Dict[str, Any]], List[Optional[str]]]:
|
| 68 |
+
"""Load and preprocess AIME dataset."""
|
| 69 |
+
dataset = load_dataset("Maxwell-Jia/AIME_2024")["train"]
|
| 70 |
+
questions = []
|
| 71 |
+
labels = []
|
| 72 |
+
for idx, q in enumerate(dataset):
|
| 73 |
+
if self.num_samples is not None and idx >= self.num_samples:
|
| 74 |
+
break
|
| 75 |
+
|
| 76 |
+
questions.append({"question": q["Problem"]})
|
| 77 |
+
# Extract answer from Answer field
|
| 78 |
+
answer = None
|
| 79 |
+
if "Answer" in q:
|
| 80 |
+
answer = str(q["Answer"]).strip()
|
| 81 |
+
elif "answer" in q:
|
| 82 |
+
answer = str(q["answer"]).strip()
|
| 83 |
+
labels.append(answer)
|
| 84 |
+
return questions, labels
|
| 85 |
+
|
| 86 |
+
def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
|
| 87 |
+
"""Extract answer from model output."""
|
| 88 |
+
return extract_aime_answer(output)
|
| 89 |
+
|
| 90 |
+
def compute_accuracy(
|
| 91 |
+
self, predictions: List[Any], labels: List[Any]
|
| 92 |
+
) -> Optional[float]:
|
| 93 |
+
"""Compute accuracy for AIME by comparing numeric answers."""
|
| 94 |
+
if not labels or len(labels) == 0:
|
| 95 |
+
return None
|
| 96 |
+
if all(label is None for label in labels):
|
| 97 |
+
return None
|
| 98 |
+
|
| 99 |
+
correct = 0
|
| 100 |
+
valid_count = 0
|
| 101 |
+
for pred, label in zip(predictions, labels):
|
| 102 |
+
if label is not None:
|
| 103 |
+
valid_count += 1
|
| 104 |
+
if pred is not None:
|
| 105 |
+
# Normalize answers for comparison
|
| 106 |
+
pred_normalized = str(pred).strip()
|
| 107 |
+
label_normalized = str(label).strip()
|
| 108 |
+
# Try exact match first
|
| 109 |
+
if pred_normalized == label_normalized:
|
| 110 |
+
correct += 1
|
| 111 |
+
else:
|
| 112 |
+
# Try numeric comparison
|
| 113 |
+
try:
|
| 114 |
+
pred_num = int(pred_normalized)
|
| 115 |
+
label_num = int(label_normalized)
|
| 116 |
+
if pred_num == label_num:
|
| 117 |
+
correct += 1
|
| 118 |
+
except ValueError:
|
| 119 |
+
pass
|
| 120 |
+
|
| 121 |
+
return correct / valid_count if valid_count > 0 else 0.0
|
| 122 |
+
|
| 123 |
+
def create_sgl_function(self):
|
| 124 |
+
"""Create SGL function for AIME with reasoning prompt."""
|
| 125 |
+
return create_simple_sgl_function(
|
| 126 |
+
function_name="reasoning_gen",
|
| 127 |
+
answer_key="answer",
|
| 128 |
+
user_prefix="\nPlease reason step by step, and put your final answer within \\boxed{}.",
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
def get_max_new_tokens(self) -> int:
|
| 132 |
+
"""AIME problems require more tokens."""
|
| 133 |
+
return 32768
|
SpecForge-ext/benchmarks/benchmarker/base.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Base class for benchmark implementations.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import time
|
| 6 |
+
from abc import ABC, abstractmethod
|
| 7 |
+
from argparse import Namespace
|
| 8 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
from sglang import set_default_backend
|
| 11 |
+
from sglang.test.test_utils import select_sglang_backend
|
| 12 |
+
|
| 13 |
+
from .utils import compute_metrics
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Benchmarker(ABC):
    """Base class for benchmark implementations.

    Subclasses must implement:
        - load_data(): load and preprocess the dataset
        - create_sgl_function(): create the SGL function for inference

    Optional overrides:
        - extract_answer(): extract an answer from raw model output
        - compute_accuracy(): compute an accuracy metric (if applicable)
        - get_answer_keys(): answer keys for multi-turn conversations
        - get_max_new_tokens(): per-benchmark generation budget

    Args:
        num_samples: Number of samples to run the benchmark on. If None,
            all questions are used.
        subset: Subset(s) of the dataset to evaluate. If None, all subsets
            are used.
    """

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        self.num_samples = num_samples
        self.subset = subset

    @abstractmethod
    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Any]]:
        """Load and preprocess the dataset.

        Returns:
            Tuple of (questions, labels) where:
                - questions: list of question dicts for the SGL function
                - labels: list of ground-truth labels (entries may be None
                  when no reference answer exists)
        """
        raise NotImplementedError

    @abstractmethod
    def create_sgl_function(self) -> Callable:
        """Create the SGL function used for inference.

        Returns:
            An SGL function decorated with @sgl.function.
        """
        raise NotImplementedError

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[Any]:
        """Extract an answer from raw model output.

        Args:
            output: Raw model output string.
            label: Optional ground-truth label for reference.

        Returns:
            The extracted answer, or None if extraction fails. The default
            implementation returns the output unchanged.
        """
        return output

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Compute an accuracy metric.

        Args:
            predictions: Predicted answers.
            labels: Ground-truth labels.

        Returns:
            Accuracy in [0, 1], or None when accuracy is not applicable
            (the default).
        """
        return None

    def get_answer_keys(self) -> Optional[List[str]]:
        """Get answer keys for multi-turn conversations.

        Returns:
            List of answer keys (e.g. ["answer_1", "answer_2"]), or None for
            single-turn benchmarks.
        """
        return None

    def get_max_new_tokens(self) -> int:
        """Get the maximum number of new tokens to generate.

        Returns:
            Maximum tokens (default: 2048).
        """
        return 2048

    def run(
        self,
        host: str,
        port: int,
        batch_size: int,
        max_new_tokens: Optional[int] = None,
        num_runs: int = 1,
    ) -> Optional[List[Any]]:
        """Run the benchmark evaluation.

        Workflow: initialize the backend, load data, create the SGL function,
        run the inference loop(s), compute metrics.

        Args:
            host (str): Host of the SGLang server (scheme added if missing).
            port (int): Port of the SGLang server.
            batch_size (int): Number of prompts processed in parallel.
            max_new_tokens (Optional[int]): Generation budget; falls back to
                get_max_new_tokens() when None.
            num_runs (int): How many times to repeat the benchmark; use a
                larger value for more stable results.

        Returns:
            List of per-run metrics objects, or None when no valid questions
            were loaded.
        """
        if not host.startswith(("http://", "https://")):
            host = f"http://{host}"
        # Initialize backend
        sglang_args = Namespace(host=host, port=port, backend="srt-no-parallel")
        set_default_backend(select_sglang_backend(sglang_args))

        # Load data
        questions, labels = self.load_data()
        if len(questions) == 0:
            print("No valid questions found. Please check the dataset format.")
            return None

        # Create SGL function
        sgl_function = self.create_sgl_function()

        # Run evaluation loops
        metrics_list = []
        answer_keys = self.get_answer_keys()
        max_new_tokens = max_new_tokens or self.get_max_new_tokens()

        for _ in range(num_runs):
            tic = time.perf_counter()
            states = sgl_function.run_batch(
                questions,
                temperature=0,
                max_new_tokens=max_new_tokens,
                num_threads=batch_size,
                progress_bar=True,
            )
            latency = time.perf_counter() - tic

            # Extract predictions from each state (states[i] supports
            # dict-like access by answer key).
            predictions = []
            primary_answer_key = answer_keys[0] if answer_keys else "answer"
            for i in range(len(states)):
                output = states[i][primary_answer_key]
                if isinstance(output, str):
                    extracted = self.extract_answer(
                        output,
                        (labels[i] if labels and i < len(labels) else None),
                    )
                else:
                    # Non-string outputs are passed through unchanged.
                    extracted = output
                predictions.append(extracted)

            # Compute accuracy if a labels list exists (even if every label
            # is None) so that compute_accuracy can decide applicability.
            accuracy = None
            has_labels_list = bool(labels)
            if has_labels_list:
                accuracy = self.compute_accuracy(predictions, labels)
                if accuracy is not None:
                    valid_count = sum(1 for p in predictions if p is not None)
                    if valid_count < len(predictions):
                        print(
                            f"Warning: {len(predictions) - valid_count} predictions could not be extracted."
                        )

            # Compute performance metrics
            metrics = compute_metrics(
                states,
                latency,
                answer_key=primary_answer_key,
                additional_answer_keys=(
                    answer_keys[1:] if answer_keys and len(answer_keys) > 1 else None
                ),
            )
            # Attach accuracy whenever a labels list exists; a None accuracy
            # is deliberately preserved so downstream reporting can show it.
            if has_labels_list:
                metrics.accuracy = accuracy
                if accuracy is not None:
                    metrics.num_valid_predictions = sum(
                        1 for p in predictions if p is not None
                    )

            metrics_list.append(metrics)
        return metrics_list
|
SpecForge-ext/benchmarks/benchmarker/ceval.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
C-Eval benchmark evaluation script.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 7 |
+
|
| 8 |
+
from datasets import concatenate_datasets, load_dataset
|
| 9 |
+
|
| 10 |
+
from .base import Benchmarker
|
| 11 |
+
from .registry import BENCHMARKS
|
| 12 |
+
from .utils import create_simple_sgl_function
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def extract_answer(answer_str: str) -> Optional[str]:
    """Extract the answer choice (A, B, C, D) from the model output.

    Tries progressively looser patterns: a standalone letter, a letter in
    parentheses/brackets or after a Chinese/English "answer:" marker, and
    finally the first occurrence of any of the four letters.

    Args:
        answer_str: Raw model output.

    Returns:
        The choice letter ("A".."D"), or None when no choice is found.
        (The original annotation claimed ``str`` but None is a real outcome.)
    """
    # Normalize so the patterns below only need to match upper-case letters.
    answer_str = answer_str.strip().upper()

    # Direct match for a standalone single letter.
    match = re.search(r"\b([ABCD])\b", answer_str)
    if match:
        return match.group(1)

    # Letter in parentheses/brackets, or after an answer marker.
    for pattern in [
        r"\(([ABCD])\)",
        r"\[([ABCD])\]",
        r"答案[::]\s*([ABCD])",
        r"Answer[::]\s*([ABCD])",
    ]:
        match = re.search(pattern, answer_str, re.IGNORECASE)
        if match:
            return match.group(1).upper()

    # Fallback: first occurrence of any of the four letters anywhere.
    match = re.search(r"([ABCD])", answer_str)
    if match:
        return match.group(1)

    return None
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def format_question(question: str, options: List[str]) -> str:
    """Render a multiple-choice prompt: the question, lettered options,
    and a Chinese instruction asking for a single A-D answer."""
    labeled = [f"{chr(65 + idx)}. {text}\n" for idx, text in enumerate(options)]
    return question + "\n\n选项:\n" + "".join(labeled) + "\n请从A、B、C、D中选择一个答案。"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@BENCHMARKS.register("ceval")
class CEvalBenchmarker(Benchmarker):
    """C-Eval benchmark implementation."""

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        # "all" is the sentinel meaning every C-Eval config.
        super().__init__(num_samples, subset if subset is not None else "all")

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[str]]:
        """Load and preprocess the C-Eval dataset."""
        all_configs = [
            "accountant",
            "advanced_mathematics",
            "art_studies",
            "basic_medicine",
            "business_administration",
            "chinese_language_and_literature",
            "civil_servant",
            "clinical_medicine",
            "college_chemistry",
            "college_economics",
            "college_physics",
            "college_programming",
            "computer_architecture",
            "computer_network",
            "discrete_mathematics",
            "education_science",
            "electrical_engineer",
            "environmental_impact_assessment_engineer",
            "fire_engineer",
            "high_school_biology",
            "high_school_chemistry",
            "high_school_chinese",
            "high_school_geography",
            "high_school_history",
            "high_school_mathematics",
            "high_school_physics",
            "high_school_politics",
            "ideological_and_moral_cultivation",
            "law",
            "legal_professional",
            "logic",
            "mao_zedong_thought",
            "marxism",
            "metrology_engineer",
            "middle_school_biology",
            "middle_school_chemistry",
            "middle_school_geography",
            "middle_school_history",
            "middle_school_mathematics",
            "middle_school_physics",
            "middle_school_politics",
            "modern_chinese_history",
            "operating_system",
            "physician",
            "plant_protection",
            "probability_and_statistics",
            "professional_tour_guide",
            "sports_science",
            "tax_accountant",
            "teacher_qualification",
            "urban_and_rural_planner",
            "veterinary_medicine",
        ]

        # Resolve which configs to load.
        if self.subset == "all":
            configs_to_load = all_configs
        else:
            for name in self.subset:
                assert (
                    name in all_configs
                ), f"Subset {name} not found in C-Eval dataset"
            configs_to_load = self.subset

        # Fetch each config's test split and concatenate them.
        try:
            loaded_splits = []
            for config in configs_to_load:
                try:
                    split = load_dataset("ceval/ceval-exam", name=config, split="test")
                    loaded_splits.append(split)
                    print(f"Loaded config '{config}' with {len(split)} samples")
                except Exception as e:
                    print(f"Warning: Failed to load config '{config}': {e}")
            if len(loaded_splits) == 0:
                raise ValueError("No configs could be loaded")
            merged = concatenate_datasets(loaded_splits)
            print(
                f"Successfully loaded C-Eval dataset with all configs (total: {len(merged)} samples)"
            )
        except Exception as e:
            print(e)
            print(f"Failed to load C-Eval dataset from 'ceval/ceval-exam': {e}")
            print("Please ensure the dataset is available or install it manually.")
            print("You can try: pip install datasets")
            print("Or download from: https://huggingface.co/datasets/ceval/ceval-exam")
            return [], []

        # Convert raw items into prompt dicts plus gold letters.
        questions = []
        labels = []
        for idx, item in enumerate(merged):
            if self.num_samples is not None and idx >= self.num_samples:
                break

            # The schema varies between mirrors; probe the common field names.
            question_text = next(
                (
                    item[key]
                    for key in ("question", "inputs", "problem", "content")
                    if key in item
                ),
                None,
            )
            if not question_text:
                continue

            # Options may come as a dict, a list, or separate A/B/C/D fields.
            options = None
            if "options" in item:
                options = item["options"]
                if isinstance(options, dict):
                    options = [options.get(letter, "") for letter in "ABCD"]
                elif isinstance(options, list):
                    while len(options) < 4:
                        options.append("")
            elif "choices" in item:
                options = item["choices"]
                if isinstance(options, dict):
                    options = [options.get(letter, "") for letter in "ABCD"]
            else:
                options = [
                    item.get(letter, item.get(f"option_{letter}", ""))
                    for letter in "ABCD"
                ]

            # Drop empty options; keep items with at least two real choices.
            if not options:
                continue
            options = [str(opt).strip() for opt in options if opt]
            if len(options) < 2:
                continue

            # The gold answer may live under several different keys.
            answer = next(
                (
                    str(item[key]).upper().strip()
                    for key in ("answer", "target", "label", "correct")
                    if key in item
                ),
                None,
            )

            if answer in ("A", "B", "C", "D"):
                questions.append({"question": format_question(question_text, options)})
                labels.append(answer)

        if len(questions) == 0:
            print("No valid questions found. Please check the dataset format.")
            print(
                "Sample item keys:",
                list(merged[0].keys()) if len(merged) > 0 else "No items",
            )
            return [], []

        return questions, labels

    def create_sgl_function(self):
        """Create SGL function for C-Eval."""
        return create_simple_sgl_function(
            function_name="get_ceval_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )

    def extract_answer(self, output: str, label: Any = None) -> str:
        """Extract answer choice from model output."""
        return extract_answer(output)

    def compute_accuracy(self, predictions: List[str], labels: List[str]) -> float:
        """Accuracy over the predictions that could be extracted."""
        correct = 0
        valid_count = 0
        for i in range(len(predictions)):
            # Only count predictions where a letter was actually extracted.
            if predictions[i] is not None:
                valid_count += 1
                if predictions[i] == labels[i]:
                    correct += 1
        return correct / valid_count if valid_count > 0 else 0.0
|
SpecForge-ext/benchmarks/benchmarker/financeqa.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 2 |
+
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
|
| 5 |
+
from .base import Benchmarker
|
| 6 |
+
from .registry import BENCHMARKS
|
| 7 |
+
from .utils import create_simple_sgl_function
|
| 8 |
+
|
| 9 |
+
QUESTION_PROMPT = """
Given the following context:

{context}

Can you answer the following question?

{question}
""".strip()


def generate_question(row: Dict[str, Any]) -> str:
    """Build the prompt for one FinanceQA row, prepending the context when present."""
    question = row["question"].strip()
    if row["context"] is None:
        return question
    return QUESTION_PROMPT.format(context=row["context"].strip(), question=question)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@BENCHMARKS.register("financeqa")
class FinanceQABenchmarker(Benchmarker):
    """FinanceQA benchmark implementation."""

    def __init__(self, num_samples: Optional[int] = None):
        # FinanceQA has no subsets.
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[int]]:
        """Load and preprocess the FinanceQA test split."""
        ds = load_dataset("AfterQuery/FinanceQA")["test"]

        questions = []
        labels = []
        for i in range(len(ds)):
            if self.num_samples is not None and i >= self.num_samples:
                break
            questions.append({"question": generate_question(ds[i])})
            # No machine-checkable gold answers; accuracy is not computed.
            labels.append(None)
        return questions, labels

    def create_sgl_function(self):
        """Create the SGL function for FinanceQA."""
        return create_simple_sgl_function(
            function_name="get_financeqa_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
|
SpecForge-ext/benchmarks/benchmarker/gpqa.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 3 |
+
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
|
| 6 |
+
from .base import Benchmarker
|
| 7 |
+
from .registry import BENCHMARKS
|
| 8 |
+
from .utils import create_simple_sgl_function
|
| 9 |
+
|
| 10 |
+
GPQA_QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

{Question}

A) {A}
B) {B}
C) {C}
D) {D}
""".strip()


def generate_question(row: Dict[str, Any]) -> Tuple[str, str]:
    """Build a shuffled multiple-choice prompt for one GPQA row.

    The correct answer is inserted at a uniformly random position among the
    three incorrect answers, so repeated calls can yield different letters.
    (The original annotation said ``-> str`` but a tuple is returned.)

    Args:
        row: Dataset row with "Question", "Correct Answer", and three
            "Incorrect Answer N" fields.

    Returns:
        Tuple of (formatted question, correct answer letter "A".."D").
    """
    gold_index = random.randint(0, 3)
    choices = [
        row["Incorrect Answer 1"],
        row["Incorrect Answer 2"],
        row["Incorrect Answer 3"],
    ]
    choices.insert(gold_index, row["Correct Answer"])

    question = GPQA_QUERY_TEMPLATE.format(
        Question=row["Question"].strip(),
        A=choices[0].strip(),
        B=choices[1].strip(),
        C=choices[2].strip(),
        D=choices[3].strip(),
    )

    # gold_index 0..3 maps directly onto the letters A..D.
    answer = ["A", "B", "C", "D"][gold_index]
    return question, answer
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@BENCHMARKS.register("gpqa")
class GPQABenchmarker(Benchmarker):
    """GPQA benchmark implementation."""

    def __init__(self, num_samples: Optional[int] = None):
        # GPQA has no subsets.
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[str]]:
        """Load and preprocess the GPQA main split.

        Returns:
            Tuple of (question dicts, correct answer letters). The labels
            are letters "A".."D" (the original annotation said ``List[int]``).
        """
        ds = load_dataset("Idavidrein/gpqa", "gpqa_main")["train"]

        questions = []
        labels = []
        for i in range(len(ds)):
            if self.num_samples is not None and i >= self.num_samples:
                break
            question_text, answer = generate_question(ds[i])
            questions.append({"question": question_text})
            labels.append(answer)
        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Extract the text after the last 'Answer: ' marker, or None."""
        if "Answer: " not in output:
            return None
        # Use the LAST occurrence: the prompt template asks for the answer on
        # the final line, and the step-by-step reasoning may mention
        # "Answer: " earlier in the output.
        return output.rsplit("Answer: ", 1)[1].strip()

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Fraction of predictions exactly matching the gold letter."""
        if not labels:
            return None
        correct = sum(1 for pred, label in zip(predictions, labels) if pred == label)
        return correct / len(labels)

    def create_sgl_function(self):
        """Create the SGL function for GPQA."""
        return create_simple_sgl_function(
            function_name="get_gpqa_mcq_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
|
SpecForge-ext/benchmarks/benchmarker/gsm8k.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GSM8K benchmark evaluation script.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import ast
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
from sglang.utils import download_and_cache_file, read_jsonl
|
| 11 |
+
|
| 12 |
+
from .base import Benchmarker
|
| 13 |
+
from .registry import BENCHMARKS
|
| 14 |
+
from .utils import create_few_shot_sgl_function
|
| 15 |
+
|
| 16 |
+
INVALID = -9999999
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_one_example(lines: List[Dict], i: int, include_answer: bool) -> str:
    """Format example i as "Question: ...\\nAnswer:", appending the gold
    answer when include_answer is True."""
    parts = ["Question: ", lines[i]["question"], "\nAnswer:"]
    if include_answer:
        parts += [" ", lines[i]["answer"]]
    return "".join(parts)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_few_shot_examples(lines: List[Dict], k: int) -> str:
    """Concatenate the first k solved examples, each followed by a blank line."""
    return "".join(get_one_example(lines, i, True) + "\n\n" for i in range(k))
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def get_answer_value(answer_str: str) -> int:
    """Extract the final integer from a GSM8K answer string.

    Commas (thousands separators) are stripped first, then the last run of
    digits is parsed.

    Args:
        answer_str: Model output or gold answer text.

    Returns:
        The parsed integer, or INVALID when no number is found or parsing
        fails.
    """
    answer_str = answer_str.replace(",", "")
    numbers = re.findall(r"\d+", answer_str)
    if not numbers:
        return INVALID
    try:
        return ast.literal_eval(numbers[-1])
    except (SyntaxError, ValueError):
        # literal_eval raises SyntaxError for e.g. leading zeros ("007") and
        # can raise ValueError on other malformed literals; the original only
        # caught SyntaxError.
        return INVALID
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@BENCHMARKS.register("gsm8k")
class GSM8KBenchmarker(Benchmarker):
    """GSM8K benchmark implementation (5-shot)."""

    def __init__(self, num_samples: Optional[int] = None):
        # GSM8K has no subsets.
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[int]]:
        """Load and preprocess the GSM8K test set.

        Prefers a local copy (path overridable via the GSM8K_LOCAL_PATH
        environment variable) and falls back to downloading from GitHub.

        Note:
            Must be called before create_sgl_function(), which reads the
            few-shot examples prepared here.
        """
        # Prefer the local data directory when it exists; the default path
        # matches the original hard-coded location for backward compatibility.
        local_path = os.environ.get(
            "GSM8K_LOCAL_PATH", "/workspace/hanrui/datasets/gsm8k/test.jsonl"
        )

        if os.path.exists(local_path):
            print(f"Loading GSM8K data from local: {local_path}")
            lines = list(read_jsonl(local_path))
        else:
            # Fall back to the official GitHub copy.
            print("Local data not found, downloading from GitHub...")
            url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
            data_path = download_and_cache_file(url)
            lines = list(read_jsonl(data_path))

        # The first five examples double as the few-shot prompt prefix.
        few_shot_examples = get_few_shot_examples(lines, 5)

        questions = []
        labels = []
        for i in range(len(lines)):
            if self.num_samples is not None and i >= self.num_samples:
                break
            questions.append({"question": get_one_example(lines, i, False)})
            labels.append(get_answer_value(lines[i]["answer"]))

        # Stash for create_sgl_function().
        self.few_shot_examples = few_shot_examples

        assert all(l != INVALID for l in labels), "Some labels are invalid"
        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[int]:
        """Extract the numeric answer from model output."""
        return get_answer_value(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Accuracy of exact numeric matches; None when there are no labels."""
        if not labels:
            return None
        correct = sum(1 for pred, label in zip(predictions, labels) if pred == label)
        return correct / len(labels)

    def create_sgl_function(self):
        """Create the SGL function with few-shot examples.

        Requires load_data() to have been called first so that
        self.few_shot_examples is populated.
        """
        return create_few_shot_sgl_function(
            few_shot_examples=self.few_shot_examples,
            function_name="few_shot_gsm8k",
            answer_key="answer",
            stop=["Question", "Assistant:", "<|separator|>"],
        )
|
SpecForge-ext/benchmarks/benchmarker/humaneval.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HumanEval benchmark evaluation script.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
from datasets import load_dataset
|
| 11 |
+
|
| 12 |
+
from .base import Benchmarker
|
| 13 |
+
from .registry import BENCHMARKS
|
| 14 |
+
from .utils import create_simple_sgl_function
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def extract_code_from_output(output: str) -> Optional[str]:
    """Pull Python source out of a model response.

    Preference order: fenced markdown code block, then a bare top-level
    function definition, then the raw (stripped) text itself.  Returns
    None only when the output is empty/whitespace.
    """
    # 1) Fenced markdown block, optionally tagged as python.
    fenced = re.search(r"```(?:python)?\n(.*?)```", output, re.DOTALL)
    if fenced is not None:
        return fenced.group(1).strip()

    # 2) A "def ..." block running until the next blank-line-separated def
    #    or the end of the string (common shape for HumanEval completions).
    bare_def = re.search(
        r"(def\s+\w+\([^)]*\):.*?)(?=\n\ndef\s+|\Z)", output, re.DOTALL
    )
    if bare_def is not None:
        return bare_def.group(1).strip()

    # 3) Fall back to the text as-is; it may already be plain code.
    stripped = output.strip()
    return stripped if stripped else None
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def check_code_passes_tests(code: str, test_code: str, entry_point: str) -> bool:
    """Return True iff *code* passes the HumanEval-style *test_code*.

    HumanEval test snippets define ``check(candidate)`` containing the
    assertions; the official harness then calls ``check(<entry_point>)``.
    The previous implementation only exec'd the test code, which merely
    *defines* ``check`` without running it, so every sample passed.  We
    now invoke ``check`` on the generated entry-point function when both
    are present (test snippets that run their own assertions still work).

    WARNING: this exec's untrusted model output in-process with no
    sandboxing.  Use the official HumanEval evaluation framework for
    anything security-sensitive or for proper pass@k metrics.
    """
    try:
        namespace: dict = {}
        # Define the candidate function(s).
        exec(code, namespace)
        # Define (and, for self-running snippets, execute) the tests.
        exec(test_code, namespace)
        # HumanEval convention: test_code defines check(candidate) but
        # does not call it; drive it explicitly against the entry point.
        check_fn = namespace.get("check")
        candidate = namespace.get(entry_point)
        if callable(check_fn) and candidate is not None:
            check_fn(candidate)
        return True
    except AssertionError:
        # An assertion inside the tests failed.
        return False
    except Exception:
        # Any other error (syntax error, runtime error, bad test code)
        # counts as a failed sample.
        return False
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@BENCHMARKS.register("humaneval")
class HumanEvalBenchmarker(Benchmarker):
    """HumanEval benchmark implementation.

    Loads the 164 HumanEval code-completion problems, extracts code from
    model outputs, and scores them by exec'ing the dataset's test cases.
    """

    def __init__(self, num_samples: Optional[int] = None):
        """Initialize benchmark and store test cases."""
        super().__init__(num_samples, None)
        # Per-question test snippets and entry-point names, filled by load_data().
        self.test_cases = []
        self.entry_points = []

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Optional[Dict[str, str]]]]:
        """Load and preprocess HumanEval dataset.

        Returns (questions, labels) where each label dict carries the
        test code, entry point, and canonical solution for scoring.
        """
        # Prefer reading from the local data directory.
        local_path = "/workspace/hanrui/datasets/humaneval/test.jsonl"

        if os.path.exists(local_path):
            print(f"Loading HumanEval data from local: {local_path}")
            with open(local_path, 'r') as f:
                dataset = [json.loads(line) for line in f]
        else:
            # If the local copy is missing, download from HuggingFace.
            print(f"Local data not found, downloading from HuggingFace...")
            dataset = load_dataset("openai/openai_humaneval")["test"]

        questions = []
        labels = []
        self.test_cases = []
        self.entry_points = []

        for idx, q in enumerate(dataset):
            # Honor the optional sample cap.
            if self.num_samples is not None and idx >= self.num_samples:
                break

            # The prompt is the function signature plus docstring.
            questions.append({"question": q["prompt"]})

            # Store test case and entry point for evaluation.
            test_code = q.get("test", "")
            entry_point = q.get("entry_point", "")
            self.test_cases.append(test_code)
            self.entry_points.append(entry_point)

            # Store canonical solution as reference (optional, for comparison).
            canonical_solution = q.get("canonical_solution", "")
            labels.append(
                {
                    "test": test_code,
                    "entry_point": entry_point,
                    "canonical_solution": canonical_solution,
                }
            )

        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Extract code from model output."""
        return extract_code_from_output(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Compute accuracy for HumanEval by checking if code passes tests.

        Note: This is a simplified evaluation. For official pass@k metrics,
        use the HumanEval evaluation framework.
        """
        if not labels or len(labels) == 0:
            return None
        if all(label is None for label in labels):
            return None

        correct = 0
        valid_count = 0

        for i, (pred, label) in enumerate(zip(predictions, labels)):
            if label is not None and isinstance(label, dict):
                valid_count += 1
                if pred is not None:
                    try:
                        # Get the prompt (function signature and docstring).
                        # NOTE(review): assumes the base class exposes
                        # load_data()'s questions as self.questions — confirm.
                        prompt = self.questions[i]["question"]
                        entry_point = label.get("entry_point", "")

                        # The prompt contains the function signature (e.g. "def name(...):").
                        # The generated code might be:
                        # 1. Just the function body - combine with prompt
                        # 2. The complete function including signature - use as-is
                        # 3. Code in markdown blocks - already extracted by extract_code_from_output

                        pred_str = str(pred).strip()

                        # Check if pred already contains a complete function definition
                        # (starts with "def " and matches the entry_point name).
                        if pred_str.startswith("def ") and entry_point:
                            # Check if this is the same function (by name).
                            func_name_match = re.match(r"def\s+(\w+)\s*\(", pred_str)
                            if (
                                func_name_match
                                and func_name_match.group(1) == entry_point
                            ):
                                # Generated code includes complete function, use it as-is.
                                full_code = pred_str
                            else:
                                # Different function or no match, combine with prompt.
                                full_code = prompt + "\n" + pred_str
                        elif pred_str.startswith("def "):
                            # Has function definition but we can't verify entry_point, use as-is.
                            full_code = pred_str
                        else:
                            # Generated code is just the body, combine with prompt.
                            full_code = prompt + "\n" + pred_str

                        # Check if code passes tests.
                        test_code = label.get("test", "")

                        if test_code and check_code_passes_tests(
                            full_code, test_code, entry_point
                        ):
                            correct += 1
                    except Exception as e:
                        # If evaluation fails, consider it incorrect.
                        # Uncomment for debugging: print(f"Error evaluating code {i}: {e}")
                        pass

        return correct / valid_count if valid_count > 0 else 0.0

    def create_sgl_function(self):
        """Create SGL function for HumanEval."""
        return create_simple_sgl_function(
            function_name="get_humaneval_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )

    def get_max_new_tokens(self) -> int:
        """HumanEval code generation requires more tokens."""
        return 1024
|
SpecForge-ext/benchmarks/benchmarker/livecodebench.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LiveCodeBench benchmark evaluation script.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 6 |
+
|
| 7 |
+
from datasets import load_dataset
|
| 8 |
+
|
| 9 |
+
from .base import Benchmarker
|
| 10 |
+
from .registry import BENCHMARKS
|
| 11 |
+
from .utils import create_simple_sgl_function
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def generate_question(row: Dict[str, Any]) -> str:
    """Return the stripped problem statement for a LiveCodeBench row."""
    return row["question_content"].strip()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@BENCHMARKS.register("livecodebench")
class LCBBenchmarker(Benchmarker):
    """LiveCodeBench benchmark implementation.

    Open-ended code generation: there are no reference labels, so only
    generation (not accuracy) is measured.
    """

    def __init__(self, num_samples: Optional[int] = None):
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[None]]:
        """Load LiveCodeBench questions; labels are all None (no gold answers)."""
        # Read data
        ds = load_dataset("livecodebench/code_generation")["test"]

        questions = []
        labels = []
        for i in range((len(ds))):
            # Honor the optional sample cap.
            if self.num_samples is not None and i >= self.num_samples:
                break

            question_text = generate_question(ds[i])
            questions.append({"question": question_text})
            # No ground truth available for this benchmark.
            labels.append(None)
        return questions, labels

    def create_sgl_function(self):
        """Create SGL function for LiveCodeBench generation."""
        return create_simple_sgl_function(
            function_name="get_livecodebench_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
|
SpecForge-ext/benchmarks/benchmarker/math500.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MATH-500 benchmark evaluation script.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 7 |
+
|
| 8 |
+
from datasets import load_dataset
|
| 9 |
+
|
| 10 |
+
from .base import Benchmarker
|
| 11 |
+
from .registry import BENCHMARKS
|
| 12 |
+
from .utils import create_simple_sgl_function
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def extract_math_answer(output: str) -> Optional[str]:
    """Extract the final answer from a math problem solution.

    Checks, in order: ``\\boxed{...}``, ``\\boxed <token>`` (no braces),
    explicit "answer is ..."-style phrases, and finally the last number
    appearing anywhere in the text.  Returns None when nothing matches.
    """
    # \boxed{...} form.
    boxed = re.search(r"\\boxed\{([^}]+)\}", output)
    if boxed:
        return boxed.group(1).strip()

    # \boxed <token> form (no braces).
    boxed = re.search(r"\\boxed\s+([^\s]+)", output)
    if boxed:
        return boxed.group(1).strip()

    # Phrases such as "Answer: 42" or a trailing "... equals 3.14".
    for pattern in (
        r"(?:answer|Answer|ANSWER)[\s:]+([-+]?\d*\.?\d+)",
        r"(?:is|equals?|=\s*)([-+]?\d*\.?\d+)\s*$",
    ):
        hits = re.findall(pattern, output, re.IGNORECASE)
        if hits:
            return hits[-1].strip()

    # Last resort: the final number anywhere in the text.
    numbers = re.findall(r"[-+]?\d*\.?\d+", output)
    return numbers[-1] if numbers else None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@BENCHMARKS.register("math500")
class Math500Benchmarker(Benchmarker):
    """MATH-500 benchmark implementation."""

    def __init__(self, num_samples: Optional[int] = None):
        super().__init__(num_samples, None)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Optional[str]]]:
        """Load and preprocess MATH-500 dataset.

        Labels are answer strings taken from the dataset's "answer" field,
        or parsed out of "solution" as a fallback; None when neither works.
        """
        dataset = load_dataset("HuggingFaceH4/MATH-500")["test"]
        questions = []
        labels = []
        for idx, q in enumerate(dataset):
            # Honor the optional sample cap.
            if self.num_samples is not None and idx >= self.num_samples:
                break

            questions.append({"question": q["problem"]})
            # Extract answer from solution or answer field.
            answer = None
            if "answer" in q:
                answer = str(q["answer"]).strip()
            elif "solution" in q:
                # Try to extract from solution text (\boxed{...} etc.).
                answer = extract_math_answer(q["solution"])
            labels.append(answer)
        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Extract answer from model output."""
        return extract_math_answer(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Compute accuracy for MATH-500 by comparing answers.

        Falls back from exact string match to a numeric comparison with
        a small tolerance; questions with no label are excluded.
        """
        if not labels or len(labels) == 0:
            return None
        if all(label is None for label in labels):
            return None

        correct = 0
        valid_count = 0
        for pred, label in zip(predictions, labels):
            if label is not None:
                valid_count += 1
                if pred is not None:
                    # Normalize answers for comparison (remove whitespace, handle different formats).
                    pred_normalized = str(pred).strip().lower()
                    label_normalized = str(label).strip().lower()
                    # Try exact match first.
                    if pred_normalized == label_normalized:
                        correct += 1
                    else:
                        # Try numeric comparison if both are numbers.
                        try:
                            pred_num = float(pred_normalized)
                            label_num = float(label_normalized)
                            # Tolerance absorbs float formatting differences.
                            if abs(pred_num - label_num) < 1e-6:
                                correct += 1
                        except ValueError:
                            pass

        return correct / valid_count if valid_count > 0 else 0.0

    def create_sgl_function(self):
        """Create SGL function for MATH-500."""
        return create_simple_sgl_function(
            function_name="get_math500_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
|
SpecForge-ext/benchmarks/benchmarker/mmlu.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 2 |
+
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
|
| 5 |
+
from .base import Benchmarker
|
| 6 |
+
from .registry import BENCHMARKS
|
| 7 |
+
from .utils import create_simple_sgl_function
|
| 8 |
+
|
| 9 |
+
GPQA_QUERY_TEMPLATE = """
|
| 10 |
+
Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
|
| 11 |
+
|
| 12 |
+
{Question}
|
| 13 |
+
|
| 14 |
+
A) {A}
|
| 15 |
+
B) {B}
|
| 16 |
+
C) {C}
|
| 17 |
+
D) {D}
|
| 18 |
+
""".strip()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def generate_question(row: Dict[str, Any]) -> Tuple[str, str]:
    """Format an MMLU row into a prompt and its gold answer letter.

    Args:
        row: Dataset row with "question", a 4-element "choices" list,
            and an integer "answer" index.

    Returns:
        (question_text, answer_letter), where answer_letter is one of
        "A"-"D" mapped from the dataset's 0-3 answer index.

    Fixes: removed a stray debug ``print(answer)`` that spammed stdout
    for every sample, and corrected the return annotation (the function
    returns a tuple, not a str).
    """
    choices = row["choices"]
    question = GPQA_QUERY_TEMPLATE.format(
        Question=row["question"].strip(),
        A=choices[0].strip(),
        B=choices[1].strip(),
        C=choices[2].strip(),
        D=choices[3].strip(),
    )

    # 0 means A, 1 means B, 2 means C, 3 means D
    answer = ["A", "B", "C", "D"][row["answer"]]
    return question, answer
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@BENCHMARKS.register("mmlu")
class MMLUBenchmarker(Benchmarker):
    """MMLU benchmark implementation."""

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        """num_samples caps questions per subset; subset defaults to ["all"]."""
        if subset is None:
            subset = ["all"]
        super().__init__(num_samples, subset)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[str]]:
        """Load each configured MMLU subset; labels are answer letters A-D."""
        # Read data
        questions = []
        labels = []

        for subset in self.subset:
            ds = load_dataset("cais/mmlu", subset)["test"]
            for i in range((len(ds))):
                # The sample cap applies per subset.
                if self.num_samples is not None and i >= self.num_samples:
                    break

                question_text, answer = generate_question(ds[i])
                questions.append({"question": question_text})
                labels.append(answer)
        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Return the text after the last prompt-mandated "Answer: " marker."""
        if "Answer: " not in output:
            return None
        return output.split("Answer: ")[1].strip()

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Fraction of predictions exactly equal to their label letters."""
        if not labels or len(labels) == 0:
            return None
        correct = sum(1 for pred, label in zip(predictions, labels) if pred == label)
        return correct / len(labels) if len(labels) > 0 else 0.0

    def create_sgl_function(self):
        """Create SGL function for MMLU."""
        return create_simple_sgl_function(
            function_name="get_mmlu_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )
|
SpecForge-ext/benchmarks/benchmarker/mmstar.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MMStar benchmark evaluation script.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
import shutil
|
| 8 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
from datasets import load_dataset
|
| 11 |
+
|
| 12 |
+
from .base import Benchmarker
|
| 13 |
+
from .registry import BENCHMARKS
|
| 14 |
+
from .utils import create_image_sgl_function
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def extract_mmstar_answer(
    output: str, options: Optional[List[str]] = None
) -> Optional[str]:
    """Extract a multiple-choice letter (A, B, C, ...) from model output.

    When *options* is given, only letters within the option range are
    accepted; otherwise A-D are assumed valid.  Returns None when no
    acceptable letter is found.
    """
    text = output.strip().upper()

    def _valid(letter: str) -> bool:
        # Highest acceptable letter is 'A' + (number of options - 1);
        # without options, default to A-D.
        if options:
            return "A" <= letter <= chr(64 + len(options))
        return "A" <= letter <= "D"

    # A standalone capital letter anywhere in the text.
    hit = re.search(r"\b([A-Z])\b", text)
    if hit and _valid(hit.group(1)):
        return hit.group(1)

    # Letters wrapped in brackets or following an "Answer:"-style prefix
    # (English and Chinese labels).
    # NOTE(review): these patterns are matched case-sensitively against
    # the already-uppercased text, so mixed-case prefixes like "Answer:"
    # can never fire — confirm whether this is intended.
    for pattern in (
        r"\(([A-Z])\)",
        r"\[([A-Z])\]",
        r"答案[::]\s*([A-Z])",
        r"Answer[::]\s*([A-Z])",
        r"选择[::]\s*([A-Z])",
    ):
        hit = re.search(pattern, text)
        if hit and _valid(hit.group(1)):
            return hit.group(1)

    return None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@BENCHMARKS.register("mmstar")
class MMStarBenchmarker(Benchmarker):
    """MMStar benchmark implementation (image-based multiple choice)."""

    def __init__(self, num_samples: Optional[int] = None):
        """Initialize benchmark and set up cache directory."""
        super().__init__(num_samples, None)
        # Filled by load_data(); cleaned up by run().
        self.cache_dir = None
        self.options_list = []  # Store options for each question

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Optional[str]]]:
        """Load and preprocess MMStar dataset.

        Images are decoded to JPEG files under a temporary cache directory
        so the SGL image function can reference them by path.
        """
        self.cache_dir = os.path.join(".cache", "mmstar_specforge")
        image_dir = os.path.join(self.cache_dir, "images")
        os.makedirs(self.cache_dir, exist_ok=True)
        os.makedirs(image_dir, exist_ok=True)
        print(f"Created temporary image directory: {self.cache_dir}")

        dataset = load_dataset("Lin-Chen/MMStar")["val"]
        questions = []
        labels = []
        self.options_list = []

        for idx, q in enumerate(dataset):
            # Honor the optional sample cap.
            if self.num_samples is not None and idx >= self.num_samples:
                break

            image = q["image"]
            # NOTE(review): joins cache_dir (not image_dir) with the
            # dataset's relative path; relies on meta image_path starting
            # with "images/" — confirm.
            image_path = os.path.join(self.cache_dir, q["meta_info"]["image_path"])
            image.convert("RGB").save(image_path, "JPEG")

            # Extract question and options.
            question_full = q["question"]
            if "Options:" in question_full:
                question_text, options_text = question_full.split("Options:", 1)
                question_text = question_text.strip()
                # Parse options (typically "A. option1", "B. option2", one per line).
                options = []
                for line in options_text.strip().split("\n"):
                    line = line.strip()
                    if line and re.match(r"^[A-Z]\.", line):
                        option_text = re.sub(r"^[A-Z]\.\s*", "", line).strip()
                        options.append(option_text)
                self.options_list.append(options)
            else:
                question_text = question_full.strip()
                self.options_list.append([])

            item = {
                "image_path": image_path,
                "question": question_text,
            }
            questions.append(item)

            # Extract ground truth answer (field name varies by dataset version).
            answer = None
            if "answer" in q:
                answer = str(q["answer"]).strip().upper()
            elif "correct_answer" in q:
                answer = str(q["correct_answer"]).strip().upper()
            elif "ground_truth" in q:
                answer = str(q["ground_truth"]).strip().upper()

            # Validate answer is a valid option letter.
            if answer and len(answer) == 1 and "A" <= answer <= "Z":
                if self.options_list[-1]:
                    # Highest acceptable letter given the parsed option count.
                    max_option = chr(64 + len(self.options_list[-1]))
                    if answer <= max_option:
                        labels.append(answer)
                    else:
                        labels.append(None)
                else:
                    labels.append(answer)
            else:
                labels.append(None)

        return questions, labels

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Extract answer from model output."""
        # Use the options for the current question if available.
        # Note: We can't easily get the question index here, so we'll use a simpler approach.
        return extract_mmstar_answer(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Compute accuracy for MMStar by comparing answer choices."""
        if not labels or len(labels) == 0:
            return None
        if all(label is None for label in labels):
            return None

        correct = 0
        valid_count = 0
        for pred, label in zip(predictions, labels):
            if label is not None:
                valid_count += 1
                if pred is not None:
                    # Normalize to uppercase for comparison.
                    pred_normalized = str(pred).strip().upper()
                    label_normalized = str(label).strip().upper()
                    if pred_normalized == label_normalized:
                        correct += 1

        return correct / valid_count if valid_count > 0 else 0.0

    def create_sgl_function(self):
        """Create SGL function for MMStar (image-based Q&A)."""
        return create_image_sgl_function(
            function_name="get_mmstar_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )

    def run(self, *args, **kwargs):
        """Run benchmark and clean up cache directory."""
        try:
            return super().run(*args, **kwargs)
        finally:
            # Clean up cache directory even when the run raises.
            if self.cache_dir and os.path.exists(self.cache_dir):
                shutil.rmtree(self.cache_dir)
                print(f"Deleted temporary directory: {self.cache_dir}")
|
SpecForge-ext/benchmarks/benchmarker/mtbench.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MT-Bench benchmark evaluation script.
|
| 3 |
+
Adapted from https://github.com/chromecast56/sglang/blob/6f145d2eadb93a116134f703358ce76f15381045/benchmark/mtbench/bench_sglang.py
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
from sglang.utils import download_and_cache_file, read_jsonl
|
| 10 |
+
|
| 11 |
+
from .base import Benchmarker
|
| 12 |
+
from .registry import BENCHMARKS
|
| 13 |
+
from .utils import create_multi_turn_sgl_function
|
| 14 |
+
|
| 15 |
+
SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@BENCHMARKS.register("mtbench")
class MTBenchBenchmarker(Benchmarker):
    """MT-Bench benchmark implementation (2-turn open-ended questions).

    Fixes: removed a dead no-op assignment (``questions_data =
    questions_data``) and a pointless ``f`` prefix on a placeholder-free
    string; translated comments to English.
    """

    def __init__(
        self, num_samples: Optional[int] = None, subset: Optional[List[str]] = None
    ):
        # support categorical data for mtbench
        if subset is None:
            subset = ["all"]
        super().__init__(num_samples, subset)

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[None]]:
        """Load and preprocess the MT-Bench question set.

        Returns (questions, labels); labels are all None because MT-Bench
        has no ground truth for accuracy computation.
        """
        # Prefer the local copy of the dataset when it exists.
        local_path = "/workspace/hanrui/datasets/mtbench/question.jsonl"

        if os.path.exists(local_path):
            print(f"Loading MT-Bench data from local: {local_path}")
            questions_data = list(read_jsonl(local_path))
        else:
            # Otherwise download the official question file from GitHub.
            print("Local data not found, downloading from GitHub...")
            url = "https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/llm_judge/data/mt_bench/question.jsonl"
            download_and_cache_file(url, filename="mtbench.jsonl")
            questions_data = list(read_jsonl("mtbench.jsonl"))

        # Each MT-Bench record carries exactly two conversation turns.
        questions = [
            {"question_1": q["turns"][0], "question_2": q["turns"][1]}
            for q in questions_data
        ]
        # MT-Bench doesn't have labels for accuracy computation.
        labels = [None] * len(questions)

        if self.num_samples is not None:
            questions = questions[: self.num_samples]
            labels = labels[: self.num_samples]
        return questions, labels

    def create_sgl_function(self):
        """Create SGL function for MT-Bench (2-turn conversation)."""
        return create_multi_turn_sgl_function(
            function_name="answer_mt_bench",
            system_prompt=SYSTEM_PROMPT,
            num_turns=2,
            max_tokens=self.get_max_new_tokens(),
        )

    def get_answer_keys(self) -> List[str]:
        """Return answer keys for multi-turn conversation."""
        return ["answer_1", "answer_2"]
|