| |
|
|
| from typing import Any, Dict, List, Optional, Tuple, Union, Callable |
| import datetime |
| import uuid |
| import json |
| import os |
| import logging |
| from dataclasses import dataclass, field |
|
|
| from recursive_swe_bench.core.recursive_task import ( |
| RecursiveTask, Trajectory, TrajectoryStep, ProblemState, |
| EvaluationResult, Feedback, TaskStatus |
| ) |
|
|
class RecursiveEvaluator:
    """
    The core evaluation harness for recursive benchmark tasks.

    This class orchestrates the recursive evaluation process, managing the
    interactions between models and tasks, tracking trajectories, and
    calculating metrics.
    """

    def __init__(
        self,
        model: Any,
        metrics: Dict[str, Any],
        config: Dict[str, Any] = None
    ):
        """
        Initialize the recursive evaluator.

        Args:
            model: The model to evaluate. Expected to expose ``solve(problem)``
                and ``get_meta_information()``.
            metrics: Dictionary of metric calculators, each exposing
                ``calculate(trajectory) -> float``.
            config: Configuration options (e.g. ``log_level``).
        """
        self.model = model
        self.metrics = metrics
        self.config = config or {}
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """
        Set up logging for the evaluator.

        Attaches the stream handler only once: the named logger is shared
        process-wide, and unconditionally adding a handler per instance
        previously stacked duplicates, emitting every record once per
        evaluator created.
        """
        logger = logging.getLogger("RecursiveEvaluator")
        if not logger.handlers:  # fix: avoid duplicate handlers across instances
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
            logger.addHandler(handler)
        logger.setLevel(self.config.get("log_level", logging.INFO))
        return logger

    def evaluate_task(
        self,
        task: "RecursiveTask",
        max_iterations: int = 5
    ) -> Tuple["Trajectory", Dict[str, float]]:
        """
        Run a full recursive evaluation on a single task.

        Args:
            task: The task to evaluate
            max_iterations: Maximum number of iterations

        Returns:
            The trajectory and calculated metrics
        """
        self.logger.info("Starting evaluation of task %s", task.task_id)

        for i in range(max_iterations):
            self.logger.info("Starting iteration %d/%d", i + 1, max_iterations)

            # Fetch the task's current problem description.
            problem = task.get_current_problem()
            self.logger.debug("Problem state: evolution_stage=%s",
                              problem['evolution_stage'])

            # Present the problem (plus all prior attempts) to the model.
            formatted_problem = self._format_problem_for_model(problem, task.trajectory)

            self.logger.debug("Requesting solution from model")
            solution = self.model.solve(formatted_problem)

            self.logger.debug("Evaluating solution")
            result, feedback = task.evaluate_solution(solution)

            self.logger.info("Solution score: %.4f, Success: %s",
                             result.score, result.success)

            # Advance the task's problem state; the task tracks the new state
            # internally, so the return value is intentionally discarded
            # (the previous unused `new_state` local has been removed).
            task.update_state(solution, result, feedback)

            if task.status != TaskStatus.IN_PROGRESS:
                self.logger.info("Task complete with status: %s", task.status.value)
                break

        self.logger.info("Calculating metrics")
        metrics_result = self._calculate_metrics(task.trajectory)

        return task.trajectory, metrics_result

    def evaluate_task_set(
        self,
        tasks: List["RecursiveTask"],
        max_iterations: int = 5,
        output_dir: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Evaluate a set of tasks and aggregate the results.

        Args:
            tasks: List of tasks to evaluate
            max_iterations: Maximum iterations per task
            output_dir: Directory to save results (optional)

        Returns:
            Dictionary of aggregated results
        """
        self.logger.info("Evaluating %d tasks", len(tasks))

        trajectories = {}
        all_metrics = {}

        for i, task in enumerate(tasks):
            self.logger.info("Evaluating task %d/%d: %s",
                             i + 1, len(tasks), task.task_id)

            trajectory, metrics = self.evaluate_task(task, max_iterations)

            trajectories[task.task_id] = trajectory
            all_metrics[task.task_id] = metrics

            # Persist each task as soon as it finishes so partial runs
            # still leave usable artifacts on disk.
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
                task_output_path = os.path.join(output_dir, f"task_{task.task_id}.json")
                task.save(task_output_path)
                self.logger.info("Saved task to %s", task_output_path)

        aggregated_metrics = self._aggregate_metrics(all_metrics)

        results = {
            "aggregated_metrics": aggregated_metrics,
            "task_metrics": all_metrics,
            "timestamp": datetime.datetime.now().isoformat(),
            "model_info": self.model.get_meta_information(),
            "total_tasks": len(tasks),
            "config": self.config
        }

        if output_dir:
            results_path = os.path.join(output_dir, "aggregated_results.json")
            with open(results_path, "w") as f:
                # default=str: config/model_info may hold arbitrary objects;
                # a best-effort stringification beats crashing the whole run
                # at the very last step.
                json.dump(results, f, indent=2, default=str)
            self.logger.info("Saved aggregated results to %s", results_path)

        return results

    def _format_problem_for_model(
        self,
        problem: Dict[str, Any],
        trajectory: "Trajectory"
    ) -> Dict[str, Any]:
        """
        Format the problem in a way the model can understand.

        Args:
            problem: The problem state
            trajectory: The trajectory so far

        Returns:
            Formatted problem for the model, including a summary of every
            previous attempt and the feedback it received.
        """
        previous_steps = [
            {
                "problem": {
                    "description": step.problem_state.description,
                    "requirements": step.problem_state.requirements,
                    "evolution_stage": step.problem_state.evolution_stage
                },
                "solution": step.solution,
                "feedback": {
                    "summary": step.feedback.summary,
                    "issues": step.feedback.issues,
                    "suggestions": step.feedback.suggestions,
                    "focus_areas": step.feedback.focus_areas
                }
            }
            for step in trajectory.steps
        ]

        return {
            "description": problem["description"],
            "code_context": problem["code_context"],
            "requirements": problem["requirements"],
            # Iterations are 1-based for the model; evolution_stage is 0-based.
            "iteration": problem["evolution_stage"] + 1,
            "previous_attempts": previous_steps
        }

    def _calculate_metrics(self, trajectory: "Trajectory") -> Dict[str, float]:
        """
        Calculate every configured metric over the trajectory.

        Args:
            trajectory: The solution trajectory

        Returns:
            Dictionary of metric values keyed by metric name
        """
        return {name: metric.calculate(trajectory)
                for name, metric in self.metrics.items()}

    def _aggregate_metrics(
        self,
        all_metrics: Dict[str, Dict[str, float]]
    ) -> Dict[str, float]:
        """
        Aggregate metrics across multiple tasks by averaging each metric.

        Args:
            all_metrics: Dictionary of metrics per task

        Returns:
            Dictionary of aggregated (mean) metric values. A metric name that
            appears in only some tasks no longer raises KeyError; it is still
            averaged over the total number of tasks.
        """
        if not all_metrics:
            return {}

        # Sum each metric across tasks, tolerating heterogeneous key sets
        # (the previous version seeded keys from the first task only and
        # raised KeyError on any extra metric from a later task).
        totals: Dict[str, float] = {}
        for task_metrics in all_metrics.values():
            for name, value in task_metrics.items():
                totals[name] = totals.get(name, 0.0) + value

        task_count = len(all_metrics)
        return {name: total / task_count for name, total in totals.items()}
|
|
|
|
| |
|
|
| from typing import Any, Dict, List, Optional |
| import numpy as np |
| from recursive_swe_bench.core.recursive_task import Trajectory |
|
|
|
|
class RecursiveMetric:
    """Abstract base class for metrics computed over solution trajectories."""

    def __init__(self, config: Dict[str, Any] = None):
        # Normalize a missing config to an empty dict so subclasses can use
        # plain `.get` lookups without None checks.
        self.config = config or {}

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Compute this metric's value for a trajectory.

        Args:
            trajectory: The solution trajectory to score

        Returns:
            The metric value

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError("Subclasses must implement this method")
|
|
|
|
class ConvergenceRate(RecursiveMetric):
    """
    Measures how quickly the model reaches a stable solution.

    Computed as the mean absolute change between consecutive scores;
    a lower value indicates faster convergence.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            # Fewer than two scores carry no convergence signal.
            return 0.0

        # Average the magnitude of score movement between successive
        # iterations.
        total_delta = 0.0
        for previous, current in zip(scores, scores[1:]):
            total_delta += abs(current - previous)

        return total_delta / (len(scores) - 1)
|
|
|
|
class AdaptationEfficiency(RecursiveMetric):
    """
    Measures improvement per feedback iteration.

    A higher value indicates more efficient adaptation.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        scores = trajectory.get_score_series()
        iterations = len(scores) - 1
        if iterations < 1:
            # No feedback iterations happened, so efficiency is undefined.
            return 0.0

        # Net first-to-last gain, clamped so regressions score zero rather
        # than negative.
        net_gain = scores[-1] - scores[0]
        if net_gain < 0:
            net_gain = 0.0

        return net_gain / iterations
|
|
|
|
class LearningCurveArea(RecursiveMetric):
    """
    Measures the area under the learning curve.

    A higher value indicates better overall performance across iterations.
    The area is normalized by ``config["max_score"]`` (default 1.0) times the
    trajectory length, so the result lies in [0, 1] when every score stays
    within [0, max_score].
    """

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Return the normalized area under the score series, or 0.0 when the
        trajectory is empty or the configured max score is non-positive.
        """
        scores = trajectory.get_score_series()
        if not scores:
            return 0.0

        # Normalize by the best achievable area so trajectories of different
        # lengths are comparable.
        max_score = self.config.get("max_score", 1.0)
        max_area = max_score * len(scores)
        if max_area <= 0:
            # Guard: a zero or negative max_score would otherwise divide by
            # zero (or flip the metric's sign).
            return 0.0

        return sum(scores) / max_area
|
|
|
|
class ProbabilisticSolutionQuality(RecursiveMetric):
    """
    Measures the distribution of solution quality using non-deterministic assessment.

    Captures the robustness of solutions via the variability reported by
    probabilistic evaluations, falling back to the final deterministic score
    when no distribution data was recorded.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        if not trajectory.steps:
            return 0.0

        # Collect every per-step quality distribution that was recorded in
        # the evaluation metrics.
        recorded = [
            step.result.metrics["probabilistic_quality_distribution"]
            for step in trajectory.steps
            if step.result.metrics
            and "probabilistic_quality_distribution" in step.result.metrics
        ]

        if not recorded:
            # No probabilistic data: fall back to the last deterministic score.
            return trajectory.get_score_series()[-1]

        # Expected value of the most recent distribution, which maps
        # quality value -> probability.
        latest = recorded[-1]
        return sum(value * probability for value, probability in latest.items())
|
|
|
|
class TransferLearningFactor(RecursiveMetric):
    """
    Measures how well learning transfers across related problems.

    Compares this trajectory's learning rate against the average learning
    rate of trajectories from related tasks, so it requires the related
    trajectories to be supplied at construction time.
    """

    def __init__(self, config: Dict[str, Any] = None, related_trajectories: List[Trajectory] = None):
        """
        Args:
            config: Optional metric configuration.
            related_trajectories: Trajectories from related tasks used as the
                transfer baseline (defaults to an empty list).
        """
        super().__init__(config)
        self.related_trajectories = related_trajectories or []

    def calculate(self, trajectory: Trajectory) -> float:
        """
        Return the ratio of this trajectory's learning rate to the mean
        learning rate of the related trajectories, or 0.0 when either side
        is undefined.
        """
        if not self.related_trajectories:
            return 0.0

        current_learning_rate = self._calculate_learning_rate(trajectory)
        # Bug fix: a trajectory with fewer than two scores has no learning
        # rate (None). Previously only the related rates were None-checked,
        # so this fell through to `None / avg_related_rate` and raised a
        # TypeError.
        if current_learning_rate is None:
            return 0.0

        related_learning_rates = [
            self._calculate_learning_rate(rel_traj)
            for rel_traj in self.related_trajectories
        ]

        # Drop related trajectories that are too short to define a rate.
        valid_related_rates = [rate for rate in related_learning_rates if rate is not None]

        if not valid_related_rates:
            return 0.0

        avg_related_rate = sum(valid_related_rates) / len(valid_related_rates)

        if avg_related_rate == 0:
            # Avoid division by zero when related tasks showed no net learning.
            return 0.0

        return current_learning_rate / avg_related_rate

    def _calculate_learning_rate(self, trajectory: Trajectory) -> Optional[float]:
        """Average score improvement per iteration, or None if undefined."""
        scores = trajectory.get_score_series()
        if len(scores) < 2:
            return None

        return (scores[-1] - scores[0]) / (len(scores) - 1)
|
|
|
|
class DynamicComplexityHandling(RecursiveMetric):
    """
    Measures how well the model handles varying problem complexity.

    Evaluates performance while accounting for changes in problem difficulty
    by weighting each score by the difficulty of the problem it was earned on.
    """

    def calculate(self, trajectory: Trajectory) -> float:
        if not trajectory.steps:
            return 0.0

        scores = trajectory.get_score_series()
        # One difficulty per step; scores and difficulties run in lockstep.
        difficulties = [step.problem_state.difficulty for step in trajectory.steps]

        if len(scores) < 2:
            # A single attempt has nothing to be weighted against.
            return scores[0]

        # Weight each score by (1 + difficulty) so harder problems count
        # for more, then average the weighted scores.
        weighted = [score * (1 + difficulty)
                    for score, difficulty in zip(scores, difficulties)]

        return sum(weighted) / len(weighted)
|
|