| import os |
| |
| from typing import Union, Any, List |
| from ..core.logging import logger |
| from .benchmark import CodingBenchmark |
| from ..core.module_utils import extract_code_blocks |
| from typing import Union, Any, List, Callable |
| from .lcb_utils.code_generation import ( |
| CodeGenerationProblem, |
| load_code_generation_dataset |
| ) |
| from .lcb_utils.test_output_prediction import ( |
| TestOutputPredictionProblem, |
| load_test_prediction_dataset |
| ) |
| from .lcb_utils.code_execution import ( |
| CodeExecutionProblem, |
| load_code_execution_dataset |
| ) |
| from .lcb_utils.evaluation import ( |
| codegen_metrics, |
| test_output_metrics, |
| code_execution_metrics, |
| reliability_guard |
| ) |
| from .lcb_utils.utils import extract_test_output_code, extract_execution_code |
|
|
|
|
# Scenario names accepted by the benchmark classes below.
VALID_SCENARIO = [
    "code_generation",
    "test_output_prediction",
    "code_execution",
]
|
|
class LiveCodeBench(CodingBenchmark):
    """Benchmark class for evaluating LLM capabilities on real-world programming tasks.

    LiveCodeBench provides a framework for evaluating different scenarios of code-related tasks:
    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: generating code that executes correctly

    The benchmark supports different evaluation modes, metrics, and can be customized
    with various parameters like timeouts, sample dates, and processing options.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
        version: Release version of the dataset to use
        num_process: Number of processes to use for evaluation
        start_date: Filter problems to those after this date
        end_date: Filter problems to those before this date
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution")
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution
    """

    def __init__(
        self,
        path: str = None,
        mode: str = "all",
        timeout: int = 60,
        k: Union[int, list] = 1,
        num_process: int = 6,
        scenario: str = "code_generation",
        version: str = "release_latest",
        start_date: str = None,
        end_date: str = None,
        use_cot_for_execution: bool = False,
        **kwargs
    ):
        # Validate with a real exception instead of `assert`: asserts are
        # stripped under `python -O`, and _load_test_data already raises
        # ValueError for the same condition.
        if scenario not in VALID_SCENARIO:
            raise ValueError(f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}.")
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario
        # NOTE(review): super().__init__ below is passed name=type(self).__name__
        # ("LiveCodeBench"), which presumably overwrites this lowercase value —
        # confirm which spelling callers expect.
        self.name = 'livecodebench'
        self.use_cot_for_execution = use_cot_for_execution
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        # Only the test split is backed by real data; train/dev are explicitly empty.
        if self.mode == "train" or self.mode == "all":
            self._train_data = None
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        """Load and return the dataset that matches ``self.scenario``.

        Raises:
            ValueError: If ``self.scenario`` is not one of ``VALID_SCENARIO``.
        """
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        # Every problem type exposes a unique question_id.
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        # The problem object knows how to serialize itself into an evaluation sample.
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Run ``graph`` on one example and return its pass@1 score.

        Args:
            graph: Awaitable callable taking (problem prompt, starter code)
                and returning the generated solution.
            example: A problem instance providing ``question_content`` and
                ``starter_code``.
        """
        prompt, entry_point = example.question_content, example.starter_code
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).

        Raises:
            ValueError: If ``self.scenario`` is not one of ``VALID_SCENARIO``.
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            # Fall back to the raw prediction when no fenced code block was
            # extracted, instead of raising IndexError on an empty result
            # (assumes extract_code_blocks returns a possibly-empty list —
            # TODO confirm against its implementation).
            solutions: List[str] = []
            for pred in prediction:
                blocks = extract_code_blocks(pred)
                solutions.append(blocks[0] if blocks else pred)

            metrics, results, metadatas = codegen_metrics(
                samples_list=label,
                generations_list=[solutions],
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )

            # Stash the raw evaluation artifacts for post-hoc inspection;
            # kept for backward compatibility with existing callers.
            self.met = metrics
            self.res = results
            self.metadatas = metadatas

        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k
| |
class AFlowLiveCodeBench(CodingBenchmark):
    """AFlow variant of the LiveCodeBench benchmark.

    Provides the same three evaluation scenarios as ``LiveCodeBench``:
    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: generating code that executes correctly

    It differs in that ``async_evaluate`` passes the problem's
    ``question_title`` (not the starter code) as the entry point, and it
    exposes ``extract_test_cases_with_entry_point`` for looking up private
    test cases by title.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
        version: Release version of the dataset to use
        num_process: Number of processes to use for evaluation
        start_date: Filter problems to those after this date
        end_date: Filter problems to those before this date
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution")
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution
    """

    def __init__(
        self,
        path: str = None,
        mode: str = "all",
        timeout: int = 60,
        k: Union[int, list] = 1,
        num_process: int = 6,
        scenario: str = "code_generation",
        version: str = "release_latest",
        start_date: str = None,
        end_date: str = None,
        use_cot_for_execution: bool = False,
        **kwargs
    ):
        # Validate with a real exception instead of `assert`: asserts are
        # stripped under `python -O`, and _load_test_data already raises
        # ValueError for the same condition.
        if scenario not in VALID_SCENARIO:
            raise ValueError(f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}.")
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario
        self.use_cot_for_execution = use_cot_for_execution
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        # Only the test split is backed by real data; train/dev are explicitly empty.
        if self.mode == "train" or self.mode == "all":
            self._train_data = None
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        """Load and return the dataset that matches ``self.scenario``.

        Raises:
            ValueError: If ``self.scenario`` is not one of ``VALID_SCENARIO``.
        """
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        # Every problem type exposes a unique question_id.
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        # The problem object knows how to serialize itself into an evaluation sample.
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Run ``graph`` on one example and return its pass@1 score.

        Note: unlike ``LiveCodeBench``, the entry point passed to ``graph``
        is the problem's ``question_title``.
        """
        prompt, entry_point = example.question_content, example.question_title
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Return the private test cases for the problem titled ``entry_point``.

        A few known entry points are hardcoded to empty test cases; otherwise
        the dev and test splits are scanned for a matching ``question_title``.

        Returns:
            The matching problem's ``private_test_cases``, ``""`` for a
            hardcoded entry point, or ``None`` when no match is found.
        """
        hardcoded_cases = {
            "remove_odd": "",
            "replace_spaces": "",
            "snake_to_camel": "",
            "Split": "",
            "swap_List": "",
            "square_Sum": "",
            "sort_sublists": "",
            "unique_sublists": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]

        # _dev_data is None unless loaded (see _load_data), so guard both
        # splits before concatenating to avoid a TypeError on None.
        for case in (self._dev_data or []) + (self._test_data or []):
            if case.question_title == entry_point:
                return case.private_test_cases

        return None

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).

        Raises:
            ValueError: If ``self.scenario`` is not one of ``VALID_SCENARIO``.
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            # Fall back to the raw prediction when no fenced code block was
            # extracted, instead of raising IndexError on an empty result
            # (assumes extract_code_blocks returns a possibly-empty list —
            # TODO confirm against its implementation).
            solutions: List[str] = []
            for pred in prediction:
                blocks = extract_code_blocks(pred)
                solutions.append(blocks[0] if blocks else pred)

            metrics, results, metadatas = codegen_metrics(
                samples_list=label,
                generations_list=[solutions],
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )

        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k