| |
|
|
| from typing import Any, Dict, List, Optional, Tuple, Set, Union |
| import uuid |
| import json |
| import re |
| import random |
| import ast |
| import copy |
| from pathlib import Path |
| import tempfile |
| import subprocess |
| import shutil |
| import os |
|
|
| from recursive_swe_bench.core.recursive_task import ( |
| RecursiveTask, ProblemState, EvaluationResult, Feedback, TaskStatus |
| ) |
|
|
class BugCategory:
    """Categories of bugs for classification and evolution.

    Plain string constants (not an Enum) so the values can be stored
    directly in JSON-serializable problem-state dictionaries.
    """
    SYNTAX = "syntax"                        # code that fails to parse
    LOGICAL = "logical"                      # wrong condition/operator/flow
    PERFORMANCE = "performance"              # needlessly slow constructs
    SECURITY = "security"                    # unsafe handling of input
    CONCURRENCY = "concurrency"              # races, locking issues
    EXCEPTION_HANDLING = "exception_handling"  # missing/overbroad handlers
    API_USAGE = "api_usage"                  # misuse of library interfaces
    MEMORY_MANAGEMENT = "memory_management"  # leaks, unbounded growth
    TYPE_ERROR = "type_error"                # incompatible types/values
    EDGE_CASE = "edge_case"                  # unhandled boundary input
    DATA_HANDLING = "data_handling"          # parsing/serialization faults
    DEPENDENCY = "dependency"                # wrong/missing dependencies
|
|
|
|
| class BugFixingTask(RecursiveTask): |
| """ |
| A recursive task for evaluating how models fix bugs in code. |
| |
| The task presents a piece of code with one or more bugs, and evolves |
| based on the model's fix attempts. As the model addresses issues, |
| the task may introduce more subtle bugs, change requirements, or |
| increase complexity to test adaptive problem-solving. |
| """ |
| |
| def __init__( |
| self, |
| initial_state: ProblemState, |
| config: Dict[str, Any] = None, |
| test_runner: Any = None |
| ): |
| """ |
| Initialize the bug fixing task. |
| |
| Args: |
| initial_state: The initial problem state |
| config: Configuration options |
| test_runner: Custom test runner (optional) |
| """ |
| super().__init__(initial_state, config) |
| self.test_runner = test_runner or DefaultTestRunner() |
| self.bug_categories: Set[str] = set( |
| self.config.get("bug_categories", [BugCategory.LOGICAL, BugCategory.SYNTAX]) |
| ) |
| self.difficulty_progression = self.config.get( |
| "difficulty_progression", [0.0, 0.15, 0.3, 0.5, 0.7] |
| ) |
| self.evolution_strategies = self.config.get( |
| "evolution_strategies", ["add_subtle_bug", "change_requirements", "increase_complexity"] |
| ) |
| |
    def _run_evaluation(self, solution: str) -> EvaluationResult:
        """
        Run tests to evaluate the solution.

        Stages the candidate code and the task's test files in a throwaway
        directory, delegates execution to the configured test runner, and
        folds the runner's results into an EvaluationResult.

        Args:
            solution: The solution code

        Returns:
            Evaluation results
        """
        # The temporary directory (and everything staged in it) is removed
        # automatically when the context exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Write the candidate solution where the runner can find it.
            solution_file = temp_path / "solution.py"
            with open(solution_file, "w") as f:
                f.write(solution)

            # Materialize the state's tests (or a generated default test).
            test_files = self._create_test_files(temp_path)

            # Delegate actual execution to the pluggable runner.
            results = self.test_runner.run_tests(
                solution_file=solution_file,
                test_files=test_files,
                code_context=self.state.code_context
            )

            # Blend pass-rate and execution success into one 0..1 score.
            score = self._calculate_score(results)

            return EvaluationResult(
                success=results["all_passed"],
                score=score,
                execution_results=results["execution"],
                error_details=results.get("errors"),
                test_results=results["tests"],
                metrics={
                    "passed_tests": results["passed_tests"],
                    "total_tests": results["total_tests"],
                    "execution_time": results["execution_time"],
                    "memory_usage": results.get("memory_usage", 0),
                    "code_complexity": self._calculate_complexity(solution)
                }
            )
| |
    def _generate_feedback(self, solution: str, result: EvaluationResult) -> Feedback:
        """
        Generate structured feedback based on evaluation results.

        Collects failing tests and runtime/syntax errors as issue records,
        derives a per-issue suggestion, determines focus areas, and produces
        a one-paragraph summary.

        Args:
            solution: The solution code
            result: The evaluation results

        Returns:
            Structured feedback
        """
        issues = []
        suggestions = []
        focus_areas = []

        # One issue record per failing test. Note: "expected"/"actual" keys
        # are always present but may be None when the runner omits them.
        if result.test_results:
            for test_name, test_result in result.test_results.items():
                if not test_result["passed"]:
                    issues.append({
                        "type": "test_failure",
                        "test": test_name,
                        "message": test_result.get("message", "Test failed"),
                        "expected": test_result.get("expected"),
                        "actual": test_result.get("actual")
                    })

        # One issue record per reported error (syntax, runtime, ...).
        if result.error_details:
            for error_type, error_info in result.error_details.items():
                issues.append({
                    "type": "error",
                    "error_type": error_type,
                    "message": error_info.get("message", "An error occurred"),
                    "location": error_info.get("location")
                })

        # Turn each issue into an actionable suggestion where possible.
        for issue in issues:
            if issue["type"] == "test_failure":
                suggestion = self._generate_suggestion_for_test_failure(
                    issue, solution, result.test_results
                )
                if suggestion:
                    suggestions.append(suggestion)
            elif issue["type"] == "error":
                suggestion = self._generate_suggestion_for_error(
                    issue, solution
                )
                if suggestion:
                    suggestions.append(suggestion)

        # Broad themes (syntax/logic/performance/...) derived from the issues.
        focus_areas = self._determine_focus_areas(issues, solution, result)

        # Forewarn the model about how the task may evolve next iteration.
        adaptation_hints = self._generate_adaptation_hints(solution, result)

        # Human-readable summary of the outcome.
        if result.success:
            summary = (
                f"Your solution passes all tests with a score of {result.score:.2f}. "
                f"The code successfully addresses the bugs in the original implementation."
            )
        else:
            passed = result.metrics.get("passed_tests", 0)
            total = result.metrics.get("total_tests", 0)
            summary = (
                f"Your solution passes {passed}/{total} tests with a score of {result.score:.2f}. "
                f"There are still issues that need to be addressed."
            )

        return Feedback(
            summary=summary,
            issues=issues,
            suggestions=suggestions,
            focus_areas=focus_areas,
            adaptation_hints=adaptation_hints
        )
| |
| def _evolve_state(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState: |
| """ |
| Evolve the problem state based on the solution and feedback. |
| |
| This method implements the recursive nature of the benchmark by |
| adapting the problem to challenge the model's understanding. |
| |
| Args: |
| solution: The attempted solution |
| result: The evaluation results |
| feedback: The feedback provided |
| |
| Returns: |
| The evolved problem state |
| """ |
| |
| if result.success and result.score > 0.95: |
| return self._increase_difficulty(solution, result, feedback) |
| |
| |
| elif result.score > 0.7: |
| return self._focus_remaining_issues(solution, result, feedback) |
| |
| |
| else: |
| return self._provide_more_guidance(solution, result, feedback) |
| |
    def _increase_difficulty(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Increase the difficulty of the problem for models that solved it well.

        Args:
            solution: The successful solution
            result: The evaluation results
            feedback: The feedback provided

        Returns:
            The evolved problem state with increased difficulty
        """
        # Deep-copy so the current state object is left untouched.
        new_state = copy.deepcopy(self.state)

        new_state.evolution_stage += 1

        # Step up the configured difficulty ladder, clamped to its last rung.
        current_difficulty_idx = min(new_state.evolution_stage,
                                   len(self.difficulty_progression) - 1)
        new_state.difficulty = self.difficulty_progression[current_difficulty_idx]

        # Weighted-random choice of how to make the task harder.
        strategy = self._select_evolution_strategy(solution, result, feedback)

        # Apply the chosen mutation to the copied state.
        if strategy == "add_subtle_bug":
            self._add_subtle_bug(new_state, solution)
        elif strategy == "change_requirements":
            self._change_requirements(new_state, solution)
        elif strategy == "increase_complexity":
            self._increase_complexity(new_state, solution)

        # Refresh the prompt text so it reflects the evolved state.
        new_state.description = self._generate_description(new_state)

        new_state.adaptation_vector = self._calculate_adaptation_vector(
            solution, result, feedback
        )

        return new_state
| |
    def _focus_remaining_issues(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Evolve the state to focus on remaining issues when the solution is close but not perfect.

        Args:
            solution: The nearly-successful solution
            result: The evaluation results
            feedback: The feedback provided

        Returns:
            The evolved problem state focusing on remaining issues
        """
        # Deep-copy so the current state object is left untouched.
        new_state = copy.deepcopy(self.state)

        new_state.evolution_stage += 1

        # Difficulty uses stage - 1, i.e. it lags one step behind the stage
        # counter rather than ramping up further.
        current_difficulty_idx = min(new_state.evolution_stage - 1,
                                   len(self.difficulty_progression) - 1)
        new_state.difficulty = self.difficulty_progression[current_difficulty_idx]

        # Carry the feedback's focus areas into the next iteration's context.
        new_state.code_context["focus_areas"] = feedback.focus_areas

        # Record which tests are still failing so the prompt can name them.
        if result.test_results:
            failing_tests = [
                test_name for test_name, test_result in result.test_results.items()
                if not test_result["passed"]
            ]
            new_state.code_context["failing_tests"] = failing_tests

        # Rebuild the prompt around the outstanding issues.
        new_state.description = self._generate_focused_description(
            new_state, feedback.issues
        )

        new_state.adaptation_vector = self._calculate_adaptation_vector(
            solution, result, feedback
        )

        return new_state
| |
    def _provide_more_guidance(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Evolve the state to provide more guidance when the solution was not very good.

        Args:
            solution: The unsuccessful solution
            result: The evaluation results
            feedback: The feedback provided

        Returns:
            The evolved problem state with more guidance
        """
        # Deep-copy so the current state object is left untouched.
        new_state = copy.deepcopy(self.state)

        new_state.evolution_stage += 1

        # Ease off: subtract an extra step from the difficulty index
        # (floored at 0) so a struggling model gets an easier variant.
        current_difficulty_idx = max(0, min(new_state.evolution_stage - 1,
                                          len(self.difficulty_progression) - 1) - 1)
        new_state.difficulty = self.difficulty_progression[current_difficulty_idx]

        # Attach explicit hints derived from the failures and focus areas.
        new_state.code_context["hints"] = self._generate_hints(
            solution, result, feedback
        )

        # Expose per-test diagnostics (message/expected/actual plus a hint)
        # for every failing test.
        if result.test_results:
            detailed_test_results = {}
            for test_name, test_result in result.test_results.items():
                if not test_result["passed"]:
                    detailed_test_results[test_name] = {
                        "message": test_result.get("message", "Test failed"),
                        "expected": test_result.get("expected"),
                        "actual": test_result.get("actual"),
                        "hint": self._generate_test_hint(test_name, test_result)
                    }
            new_state.code_context["detailed_test_results"] = detailed_test_results

        # Rebuild the prompt with issues, suggestions, and hints inlined.
        new_state.description = self._generate_guided_description(
            new_state, feedback.issues, feedback.suggestions
        )

        new_state.adaptation_vector = self._calculate_adaptation_vector(
            solution, result, feedback
        )

        return new_state
| |
| def _select_evolution_strategy(self, solution: str, result: EvaluationResult, feedback: Feedback) -> str: |
| """ |
| Select an evolution strategy based on the current state and solution. |
| |
| Args: |
| solution: The current solution |
| result: The evaluation results |
| feedback: The feedback provided |
| |
| Returns: |
| The selected evolution strategy |
| """ |
| available_strategies = self.evolution_strategies.copy() |
| |
| |
| weights = {} |
| |
| |
| if result.score > 0.95: |
| weights["add_subtle_bug"] = 0.6 |
| weights["change_requirements"] = 0.3 |
| weights["increase_complexity"] = 0.1 |
| |
| |
| elif self.state.evolution_stage >= 2 and "bug_count" in self.state.code_context and self.state.code_context["bug_count"] >= 3: |
| weights["add_subtle_bug"] = 0.1 |
| weights["change_requirements"] = 0.7 |
| weights["increase_complexity"] = 0.2 |
| |
| |
| elif result.score > 0.85: |
| weights["add_subtle_bug"] = 0.2 |
| weights["change_requirements"] = 0.2 |
| weights["increase_complexity"] = 0.6 |
| |
| |
| else: |
| weights = {strategy: 1.0 / len(available_strategies) |
| for strategy in available_strategies} |
| |
| |
| total_weight = sum(weights.get(strategy, 0) for strategy in available_strategies) |
| normalized_weights = [weights.get(strategy, 0) / total_weight |
| for strategy in available_strategies] |
| |
| |
| return random.choices(available_strategies, weights=normalized_weights)[0] |
| |
| def _add_subtle_bug(self, state: ProblemState, solution: str) -> None: |
| """ |
| Add a subtle bug to the solution code. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| """ |
| |
| try: |
| parsed_solution = ast.parse(solution) |
| except SyntaxError: |
| |
| self._add_syntax_error(state, solution) |
| return |
| |
| |
| available_categories = list(self.bug_categories) |
| if available_categories: |
| bug_category = random.choice(available_categories) |
| else: |
| bug_category = BugCategory.LOGICAL |
| |
| |
| if bug_category == BugCategory.SYNTAX: |
| self._add_syntax_error(state, solution) |
| elif bug_category == BugCategory.LOGICAL: |
| self._add_logical_error(state, solution, parsed_solution) |
| elif bug_category == BugCategory.PERFORMANCE: |
| self._add_performance_issue(state, solution, parsed_solution) |
| elif bug_category == BugCategory.EDGE_CASE: |
| self._add_edge_case_issue(state, solution, parsed_solution) |
| else: |
| |
| self._add_logical_error(state, solution, parsed_solution) |
| |
| |
| if "bug_count" not in state.code_context: |
| state.code_context["bug_count"] = 0 |
| state.code_context["bug_count"] += 1 |
| |
| |
| if "bug_categories" not in state.code_context: |
| state.code_context["bug_categories"] = [] |
| state.code_context["bug_categories"].append(bug_category) |
| |
| def _change_requirements(self, state: ProblemState, solution: str) -> None: |
| """ |
| Change the requirements to challenge the current solution. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| """ |
| |
| requirements = state.requirements |
| |
| |
| new_requirement = self._generate_new_requirement(state, solution) |
| if new_requirement: |
| requirements.append(new_requirement) |
| |
| |
| if requirements and random.random() < 0.5: |
| idx = random.randint(0, len(requirements) - 1) |
| requirements[idx] = self._modify_requirement(requirements[idx], state, solution) |
| |
| def _increase_complexity(self, state: ProblemState, solution: str) -> None: |
| """ |
| Increase the complexity of the task. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| """ |
| |
| try: |
| parsed_solution = ast.parse(solution) |
| except SyntaxError: |
| |
| self._add_edge_case_requirement(state) |
| return |
| |
| |
| strategies = [ |
| "add_edge_cases", |
| "increase_data_volume", |
| "add_performance_constraint", |
| "expand_functionality" |
| ] |
| |
| strategy = random.choice(strategies) |
| |
| if strategy == "add_edge_cases": |
| self._add_edge_case_requirement(state) |
| elif strategy == "increase_data_volume": |
| self._increase_data_volume(state, solution) |
| elif strategy == "add_performance_constraint": |
| self._add_performance_constraint(state, solution) |
| elif strategy == "expand_functionality": |
| self._expand_functionality(state, solution) |
| |
| def _create_test_files(self, temp_path: Path) -> List[Path]: |
| """ |
| Create test files based on the current problem state. |
| |
| Args: |
| temp_path: The temporary directory path |
| |
| Returns: |
| List of test file paths |
| """ |
| test_files = [] |
| |
| |
| if "tests" in self.state.code_context: |
| for i, test in enumerate(self.state.code_context["tests"]): |
| test_file = temp_path / f"test_{i}.py" |
| with open(test_file, "w") as f: |
| f.write(test["content"]) |
| test_files.append(test_file) |
| |
| |
| if not test_files: |
| test_file = temp_path / "test_default.py" |
| with open(test_file, "w") as f: |
| f.write(self._generate_default_test()) |
| test_files.append(test_file) |
| |
| return test_files |
| |
| def _calculate_score(self, results: Dict[str, Any]) -> float: |
| """ |
| Calculate a score based on test results. |
| |
| Args: |
| results: The test results |
| |
| Returns: |
| A score between 0 and 1 |
| """ |
| |
| if results["total_tests"] == 0: |
| test_score = 0.0 |
| else: |
| test_score = results["passed_tests"] / results["total_tests"] |
| |
| |
| execution_score = 1.0 if results["execution"]["success"] else 0.0 |
| |
| |
| weights = self.config.get("score_weights", {"test": 0.7, "execution": 0.3}) |
| score = (test_score * weights["test"] + execution_score * weights["execution"]) |
| |
| |
| difficulty_modifier = 1.0 + (self.state.difficulty * 0.2) |
| score = score / difficulty_modifier |
| |
| return max(0.0, min(1.0, score)) |
| |
| def _calculate_complexity(self, code: str) -> float: |
| """ |
| Calculate the complexity of code. |
| |
| Args: |
| code: The code to analyze |
| |
| Returns: |
| A complexity score |
| """ |
| |
| complexity = 1 |
| |
| |
| for pattern in ["if", "for", "while", "and", "or"]: |
| complexity += code.count(f" {pattern} ") |
| |
| |
| complexity += code.count("def ") |
| |
| |
| normalized = min(1.0, complexity / 50.0) |
| |
| return normalized |
| |
    def _generate_suggestion_for_test_failure(
        self,
        issue: Dict[str, Any],
        solution: str,
        test_results: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate a suggestion for a test failure.

        Looks up the failing test's source (when available on the state) and
        suggests, in order of preference: implementing missing functions,
        fixing a specific assertion, or a generic fix for the test.

        Args:
            issue: The issue data
            solution: The solution code
            test_results: The test results

        Returns:
            A suggestion dictionary
        """
        test_name = issue["test"]
        test_result = test_results[test_name]

        # Find the source of the failing test among the state's tests.
        test_content = None
        for test in self.state.code_context.get("tests", []):
            if test.get("name") == test_name:
                test_content = test.get("content")
                break

        if test_content:
            # First assertion in the test file, if any.
            assertion_match = re.search(r"assert.*", test_content)
            assertion = assertion_match.group(0) if assertion_match else None

            # NOTE(review): this collects functions *defined in* the test
            # file (typically test_* functions) and flags any not defined in
            # the solution — presumably intended to find functions the tests
            # call; verify this heuristic against real test content.
            test_funcs = re.findall(r"def\s+(\w+)", test_content)
            solution_funcs = re.findall(r"def\s+(\w+)", solution)

            missing_funcs = [f for f in test_funcs if f not in solution_funcs]

            if missing_funcs:
                return {
                    "type": "missing_function",
                    "message": f"Implement the missing function(s): {', '.join(missing_funcs)}",
                    "functions": missing_funcs
                }
            elif assertion:
                return {
                    "type": "fix_assertion_failure",
                    "message": f"Fix the code to pass the assertion: {assertion}",
                    "assertion": assertion,
                    "expected": test_result.get("expected"),
                    "actual": test_result.get("actual")
                }
            else:
                return {
                    "type": "fix_test_failure",
                    "message": f"Fix the code to pass the test: {test_name}",
                    "test_name": test_name
                }
        else:
            # No source available for this test: generic advice only.
            return {
                "type": "general_fix",
                "message": f"Fix the code to pass the failing test: {test_name}"
            }
| |
| def _generate_suggestion_for_error( |
| self, |
| issue: Dict[str, Any], |
| solution: str |
| ) -> Dict[str, Any]: |
| """ |
| Generate a suggestion for an error. |
| |
| Args: |
| issue: The issue data |
| solution: The solution code |
| |
| Returns: |
| A suggestion dictionary |
| """ |
| error_type = issue["error_type"] |
| message = issue["message"] |
| location = issue.get("location") |
| |
| if error_type == "syntax": |
| return { |
| "type": "fix_syntax", |
| "message": f"Fix the syntax error: {message}", |
| "location": location |
| } |
| elif error_type == "runtime": |
| return { |
| "type": "fix_runtime_error", |
| "message": f"Fix the runtime error: {message}", |
| "location": location |
| } |
| else: |
| return { |
| "type": "fix_error", |
| "message": f"Fix the error: {message}", |
| "error_type": error_type, |
| "location": location |
| } |
| |
| def _determine_focus_areas( |
| self, |
| issues: List[Dict[str, Any]], |
| solution: str, |
| result: EvaluationResult |
| ) -> List[str]: |
| """ |
| Determine focus areas based on issues and results. |
| |
| Args: |
| issues: The identified issues |
| solution: The solution code |
| result: The evaluation results |
| |
| Returns: |
| List of focus areas |
| """ |
| focus_areas = [] |
| |
| |
| syntax_issues = [i for i in issues if i.get("error_type") == "syntax"] |
| if syntax_issues: |
| focus_areas.append("syntax") |
| |
| |
| test_issues = [i for i in issues if i["type"] == "test_failure"] |
| if test_issues: |
| if any("expected" in i and "actual" in i for i in test_issues): |
| focus_areas.append("logic") |
| else: |
| focus_areas.append("functionality") |
| |
| |
| if result.metrics and "execution_time" in result.metrics: |
| if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0): |
| focus_areas.append("performance") |
| |
| |
| if result.metrics and "code_complexity" in result.metrics: |
| if result.metrics["code_complexity"] > self.config.get("complexity_threshold", 0.7): |
| focus_areas.append("complexity") |
| |
| |
| if not focus_areas: |
| focus_areas.append("general") |
| |
| return focus_areas |
| |
| def _generate_adaptation_hints( |
| self, |
| solution: str, |
| result: EvaluationResult |
| ) -> List[Dict[str, Any]]: |
| """ |
| Generate hints about how the problem might adapt in the next iteration. |
| |
| Args: |
| solution: The solution code |
| result: The evaluation results |
| |
| Returns: |
| List of adaptation hints |
| """ |
| hints = [] |
| |
| |
| if result.score > 0.8: |
| hints.append({ |
| "type": "complexity_increase", |
| "message": "The problem may become more complex in the next iteration." |
| }) |
| |
| |
| if result.score > 0.9 and self.state.evolution_stage >= 1: |
| hints.append({ |
| "type": "requirement_change", |
| "message": "The requirements may change in the next iteration." |
| }) |
| |
| |
| if result.score > 0.95: |
| hints.append({ |
| "type": "new_bugs", |
| "message": "New, more subtle bugs may be introduced in the next iteration." |
| }) |
| |
| |
| if result.score > 0.7 and result.score < 0.95: |
| focus_areas = result.metrics.get("focus_areas", []) |
| if focus_areas: |
| hints.append({ |
| "type": "focus_shift", |
| "message": f"The next iteration may focus more on: {', '.join(focus_areas)}", |
| "areas": focus_areas |
| }) |
| |
| return hints |
| |
| def _generate_description(self, state: ProblemState) -> str: |
| """ |
| Generate a description for the current problem state. |
| |
| Args: |
| state: The problem state |
| |
| Returns: |
| A descriptive prompt for the problem |
| """ |
| |
| base_desc = ( |
| f"Fix the bug(s) in the following code. " |
| f"This is iteration {state.evolution_stage + 1} of the task." |
| ) |
| |
| |
| if "bug_categories" in state.code_context: |
| categories = state.code_context["bug_categories"] |
| if categories: |
| base_desc += f"\n\nThe code contains the following types of issues: {', '.join(categories)}." |
| |
| |
| if state.requirements: |
| base_desc += "\n\nRequirements:" |
| for i, req in enumerate(state.requirements): |
| base_desc += f"\n{i+1}. {req['description']}" |
| |
| |
| difficulty_desc = "easy" |
| if state.difficulty > 0.3 and state.difficulty <= 0.6: |
| difficulty_desc = "moderate" |
| elif state.difficulty > 0.6 and state.difficulty <= 0.8: |
| difficulty_desc = "challenging" |
| elif state.difficulty > 0.8: |
| difficulty_desc = "very challenging" |
| |
| base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task." |
| |
| return base_desc |
| |
| def _generate_focused_description(self, state: ProblemState, issues: List[Dict[str, Any]]) -> str: |
| """ |
| Generate a description focused on remaining issues. |
| |
| Args: |
| state: The problem state |
| issues: The identified issues |
| |
| Returns: |
| A descriptive prompt focused on remaining issues |
| """ |
| base_desc = self._generate_description(state) |
| |
| |
| if issues: |
| base_desc += "\n\nFocus on the following issues:" |
| for i, issue in enumerate(issues): |
| if issue["type"] == "test_failure": |
| base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}" |
| else: |
| base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}" |
| |
| |
| if "focus_areas" in state.code_context: |
| areas = state.code_context["focus_areas"] |
| if areas: |
| base_desc += f"\n\nPay particular attention to: {', '.join(areas)}." |
| |
| return base_desc |
| |
| def _generate_guided_description( |
| self, |
| state: ProblemState, |
| issues: List[Dict[str, Any]], |
| suggestions: List[Dict[str, Any]] |
| ) -> str: |
| """ |
| Generate a description with added guidance. |
| |
| Args: |
| state: The problem state |
| issues: The identified issues |
| suggestions: The suggested fixes |
| |
| Returns: |
| A descriptive prompt with added guidance |
| """ |
| base_desc = self._generate_description(state) |
| |
| |
| if issues: |
| base_desc += "\n\nThe following issues were identified in your previous solution:" |
| for i, issue in enumerate(issues): |
| if issue["type"] == "test_failure": |
| base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}" |
| if "expected" in issue and "actual" in issue: |
| base_desc += f"\n Expected: {issue['expected']}" |
| base_desc += f"\n Actual: {issue['actual']}" |
| else: |
| base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}" |
| if "location" in issue: |
| base_desc += f"\n Location: {issue['location']}" |
| |
| |
| if suggestions: |
| base_desc += "\n\nConsider the following suggestions:" |
| for i, suggestion in enumerate(suggestions): |
| base_desc += f"\n{i+1}. {suggestion['message']}" |
| |
| |
| if "hints" in state.code_context: |
| hints = state.code_context["hints"] |
| if hints: |
| base_desc += "\n\nHints:" |
| for i, hint in enumerate(hints): |
| base_desc += f"\n{i+1}. {hint}" |
| |
| return base_desc |
| |
| def _generate_hints( |
| self, |
| solution: str, |
| result: EvaluationResult, |
| feedback: Feedback |
| ) -> List[str]: |
| """ |
| Generate hints based on the solution and feedback. |
| |
| Args: |
| solution: The solution code |
| result: The evaluation results |
| feedback: The feedback provided |
| |
| Returns: |
| List of hints |
| """ |
| hints = [] |
| |
| |
| if result.test_results: |
| failing_tests = [ |
| test_name for test_name, test_result in result.test_results.items() |
| if not test_result["passed"] |
| ] |
| |
| if failing_tests: |
| test_hint = "Focus on fixing the failing tests" |
| |
| |
| for test_name in failing_tests[:2]: |
| test_result = result.test_results[test_name] |
| if "expected" in test_result and "actual" in test_result: |
| test_hint += f". For test '{test_name}', expected '{test_result['expected']}' but got '{test_result['actual']}'" |
| |
| hints.append(test_hint + ".") |
| |
| |
| if result.error_details: |
| for error_type, error_info in result.error_details.items(): |
| hints.append(f"Fix the {error_type} error: {error_info.get('message', 'Unknown error')}.") |
| |
| |
| for area in feedback.focus_areas: |
| if area == "syntax": |
| hints.append("Check your syntax carefully, especially parentheses, indentation, and function definitions.") |
| elif area == "logic": |
| hints.append("Review the logic of your solution, especially conditional statements and loop conditions.") |
| elif area == "functionality": |
| hints.append("Ensure your solution implements all required functionality specified in the tests.") |
| elif area == "performance": |
| hints.append("Consider optimizing your solution for better performance, avoid unnecessary operations.") |
| elif area == "complexity": |
| hints.append("Try to simplify your solution, it may be more complex than necessary.") |
| |
| return hints |
| |
| def _generate_test_hint(self, test_name: str, test_result: Dict[str, Any]) -> str: |
| """ |
| Generate a hint for a specific failing test. |
| |
| Args: |
| test_name: The name of the test |
| test_result: The test result |
| |
| Returns: |
| A hint for the test |
| """ |
| if "expected" in test_result and "actual" in test_result: |
| return f"The test expected '{test_result['expected']}' but got '{test_result['actual']}'" |
| elif "message" in test_result: |
| return test_result["message"] |
| else: |
| return "The test failed, but no detailed information is available." |
| |
| def _add_syntax_error(self, state: ProblemState, solution: str) -> None: |
| """ |
| Add a syntax error to the solution code. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| """ |
| lines = solution.split('\n') |
| if not lines: |
| return |
| |
| |
| idx = random.randint(0, len(lines) - 1) |
| line = lines[idx] |
| |
| |
| while not line.strip() or line.strip().startswith('#'): |
| idx = random.randint(0, len(lines) - 1) |
| line = lines[idx] |
| |
| |
| mod_type = random.choice([ |
| "remove_character", |
| "add_character", |
| "swap_characters", |
| "change_indent" |
| ]) |
| |
| if mod_type == "remove_character" and line: |
| char_idx = random.randint(0, len(line) - 1) |
| lines[idx] = line[:char_idx] + line[char_idx+1:] |
| |
| elif mod_type == "add_character": |
| char_idx = random.randint(0, len(line)) |
| char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."]) |
| lines[idx] = line[:char_idx] + char + line[char_idx:] |
| |
| elif mod_type == "swap_characters" and len(line) >= 2: |
| char_idx = random.randint(0, len(line) - 2) |
| lines[idx] = (line[:char_idx] + line[char_idx+1] + |
| line[char_idx] + line[char_idx+2:]) |
| |
| elif mod_type == "change_indent": |
| |
| if line.startswith(" "): |
| lines[idx] = line[2:] |
| else: |
| lines[idx] = " " + line |
| |
| |
| modified_code = '\n'.join(lines) |
| state.code_context["code"] = modified_code |
| |
| |
| if "bugs" not in state.code_context: |
| state.code_context["bugs"] = [] |
| |
| state.code_context["bugs"].append({ |
| "type": "syntax", |
| "line": idx + 1, |
| "description": f"Syntax error introduced in line {idx + 1}" |
| }) |
| |
| def _add_logical_error(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None: |
| """ |
| Add a logical error to the solution code. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| parsed_solution: The parsed AST of the solution |
| """ |
| modification_types = [ |
| "change_comparison", |
| "invert_condition", |
| "off_by_one", |
| "change_operator", |
| "reverse_logic" |
| ] |
| |
| mod_type = random.choice(modification_types) |
| lines = solution.split('\n') |
| |
| |
| if_statements = [] |
| for i, line in enumerate(lines): |
| if re.search(r'\bif\b|\bwhile\b|\bfor\b', line): |
| if_statements.append((i, line)) |
| |
| if if_statements: |
| |
| idx, line = random.choice(if_ |
| |
|
|
| if if_statements: |
| |
| idx, line = random.choice(if_statements) |
| |
| if mod_type == "change_comparison": |
| |
| comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="} |
| for op, new_op in comparisons.items(): |
| if op in line: |
| lines[idx] = line.replace(op, new_op, 1) |
| break |
| |
| elif mod_type == "invert_condition": |
| |
| if "not" in line: |
| lines[idx] = line.replace("not ", "", 1) |
| else: |
| match = re.search(r'(if|while)\s+([^:]+):', line) |
| if match: |
| condition = match.group(2) |
| lines[idx] = line.replace(condition, f"not ({condition})", 1) |
| |
| elif mod_type == "off_by_one": |
| |
| for op in ["+", "-"]: |
| if op in line: |
| |
| match = re.search(f'\\{op}\\s*(\\d+)', line) |
| if match: |
| num = int(match.group(1)) |
| new_num = num + 1 if op == "+" else max(0, num - 1) |
| lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1) |
| break |
| |
| elif mod_type == "change_operator": |
| |
| operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"} |
| for op, new_op in operators.items(): |
| if f" {op} " in line: |
| lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1) |
| break |
| |
| elif mod_type == "reverse_logic": |
| |
| if " and " in line: |
| parts = line.split(" and ") |
| lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1) |
| elif " or " in line: |
| parts = line.split(" or ") |
| lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1) |
| |
| else: |
| |
| |
| assignments = [] |
| for i, line in enumerate(lines): |
| if "=" in line and "==" not in line and "!=" not in line: |
| assignments.append((i, line)) |
| |
| if assignments: |
| |
| idx, line = random.choice(assignments) |
| |
| |
| if "+" in line: |
| lines[idx] = line.replace("+", "-", 1) |
| elif "-" in line: |
| lines[idx] = line.replace("-", "+", 1) |
| elif "*" in line: |
| lines[idx] = line.replace("*", "/", 1) |
| elif "/" in line: |
| lines[idx] = line.replace("/", "*", 1) |
| else: |
| |
| match = re.search(r'=\s*(\d+)', line) |
| if match: |
| num = int(match.group(1)) |
| new_num = num + random.choice([-1, 1]) * random.randint(1, 3) |
| lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1) |
| |
| |
| modified_code = '\n'.join(lines) |
| state.code_context["code"] = modified_code |
| |
| |
| if "bugs" not in state.code_context: |
| state.code_context["bugs"] = [] |
| |
| state.code_context["bugs"].append({ |
| "type": "logical", |
| "line": idx + 1, |
| "description": f"Logical error introduced in line {idx + 1}: {mod_type}" |
| }) |
| |
| def _add_performance_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None: |
| """ |
| Add a performance issue to the solution code. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| parsed_solution: The parsed AST of the solution |
| """ |
| lines = solution.split('\n') |
| |
| |
| loops = [] |
| for i, line in enumerate(lines): |
| if re.search(r'\bfor\b|\bwhile\b', line): |
| loops.append((i, line)) |
| |
| if loops: |
| |
| idx, line = random.choice(loops) |
| |
| |
| mod_type = random.choice([ |
| "add_nested_loop", |
| "replace_efficient_operation", |
| "add_redundant_computation" |
| ]) |
| |
| if mod_type == "add_nested_loop": |
| |
| indent = len(line) - len(line.lstrip()) |
| indent_str = ' ' * indent |
| loop_body_indent = indent_str + ' ' |
| |
| |
| end_idx = idx + 1 |
| while end_idx < len(lines) and (not lines[end_idx].strip() or len(lines[end_idx]) - len(lines[end_idx].lstrip()) > indent): |
| end_idx += 1 |
| |
| |
| insert_pos = end_idx |
| lines.insert(insert_pos, f"{loop_body_indent}for _ in range(100): # Unnecessary loop") |
| lines.insert(insert_pos + 1, f"{loop_body_indent} pass") |
| |
| elif mod_type == "replace_efficient_operation": |
| |
| |
| for i in range(idx + 1, min(idx + 10, len(lines))): |
| if "append" in lines[i] or "extend" in lines[i]: |
| indent = len(lines[i]) - len(lines[i].lstrip()) |
| indent_str = ' ' * indent |
| match = re.search(r'(\w+)\.(append|extend)', lines[i]) |
| if match: |
| list_name = match.group(1) |
| operation = match.group(2) |
| item = lines[i].split(f"{list_name}.{operation}(")[1].split(")")[0] |
| |
| if operation == "append": |
| |
| lines[i] = f"{indent_str}{list_name} = {list_name} + [{item}] # Less efficient than append" |
| elif operation == "extend": |
| |
| lines[i] = f"{indent_str}{list_name} = {list_name} + {item} # Less efficient than extend" |
| break |
| |
| elif mod_type == "add_redundant_computation": |
| |
| |
| if idx + 1 < len(lines): |
| body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip()) |
| body_indent_str = ' ' * body_indent |
| |
| |
| lines.insert(idx + 1, f"{body_indent_str}temp = [] # Redundant computation") |
| lines.insert(idx + 2, f"{body_indent_str}for i in range(1000):") |
| lines.insert(idx + 3, f"{body_indent_str} temp.append(i)") |
| lines.insert(idx + 4, f"{body_indent_str} temp.sort() # Unnecessary sort in each iteration") |
| |
| else: |
| |
| function_defs = [] |
| for i, line in enumerate(lines): |
| if line.strip().startswith("def "): |
| function_defs.append((i, line)) |
| |
| if function_defs: |
| |
| idx, line = random.choice(function_defs) |
| |
| |
| if idx + 1 < len(lines): |
| body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip()) |
| body_indent_str = ' ' * body_indent |
| |
| |
| lines.insert(idx + 1, f"{body_indent_str}# Inefficient data structure usage") |
| lines.insert(idx + 2, f"{body_indent_str}data = []") |
| lines.insert(idx + 3, f"{body_indent_str}for i in range(1000):") |
| lines.insert(idx + 4, f"{body_indent_str} data.append(i)") |
| lines.insert(idx + 5, f"{body_indent_str} # Inefficient search operation") |
| lines.insert(idx + 6, f"{body_indent_str} if i in data: # Linear search instead of using a set") |
| lines.insert(idx + 7, f"{body_indent_str} pass") |
| |
| |
| modified_code = '\n'.join(lines) |
| state.code_context["code"] = modified_code |
| |
| |
| if "bugs" not in state.code_context: |
| state.code_context["bugs"] = [] |
| |
| state.code_context["bugs"].append({ |
| "type": "performance", |
| "line": idx + 1, |
| "description": f"Performance issue introduced around line {idx + 1}" |
| }) |
| |
| def _add_edge_case_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None: |
| """ |
| Add an edge case issue to the solution code. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| parsed_solution: The parsed AST of the solution |
| """ |
| lines = solution.split('\n') |
| |
| |
| functions = [] |
| current_func = None |
| func_start = None |
| for i, line in enumerate(lines): |
| if line.strip().startswith("def "): |
| if current_func: |
| functions.append((func_start, i - 1, current_func)) |
| current_func = line.strip()[4:].split("(")[0] |
| func_start = i |
| elif i == len(lines) - 1 and current_func: |
| functions.append((func_start, i, current_func)) |
| |
| if functions: |
| |
| start_idx, end_idx, func_name = random.choice(functions) |
| |
| |
| mod_type = random.choice([ |
| "remove_boundary_check", |
| "introduce_zero_division", |
| "handling_empty_input", |
| "type_assumption" |
| ]) |
| |
| if mod_type == "remove_boundary_check": |
| |
| for i in range(start_idx, end_idx + 1): |
| if re.search(r'if\s+.*(?:len|count|size|length|empty|<=|>=|<|>|\!=)', lines[i]): |
| |
| lines[i] = f"# {lines[i]} # Boundary check removed" |
| |
| j = i + 1 |
| indent = len(lines[i]) - len(lines[i].lstrip()) |
| body_indent = indent + 4 |
| while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent): |
| lines[j] = f"# {lines[j]}" |
| j += 1 |
| break |
| |
| elif mod_type == "introduce_zero_division": |
| |
| for i in range(start_idx, end_idx + 1): |
| if "/" in lines[i] and "try" not in lines[i] and "except" not in lines[i]: |
| |
| if re.search(r'if\s+.*(?:!=\s*0|>\s*0)', lines[i]): |
| lines[i] = f"# {lines[i]} # Denominator check removed" |
| else: |
| |
| match = re.search(r'(\w+)\s*/\s*(\w+)', lines[i]) |
| if match: |
| denominator = match.group(2) |
| |
| indent = len(lines[i]) - len(lines[i].lstrip()) |
| indent_str = ' ' * indent |
| lines.insert(i, f"{indent_str}if random.random() < 0.1: # Introduce potential zero division") |
| lines.insert(i + 1, f"{indent_str} {denominator} = 0") |
| break |
| |
| elif mod_type == "handling_empty_input": |
| |
| params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx]) |
| if params and params.group(1): |
| param_list = [p.strip() for p in params.group(1).split(",")] |
| if param_list: |
| param = param_list[0].split("=")[0].strip() |
| |
| for i in range(start_idx + 1, end_idx + 1): |
| if re.search(rf'if\s+.*(?:not\s+{param}|len\s*\(\s*{param}\s*\)\s*==\s*0)', lines[i]): |
| |
| lines[i] = f"# {lines[i]} # Empty input check removed" |
| |
| j = i + 1 |
| indent = len(lines[i]) - len(lines[i].lstrip()) |
| body_indent = indent + 4 |
| while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent): |
| lines[j] = f"# {lines[j]}" |
| j += 1 |
| break |
| |
| elif mod_type == "type_assumption": |
| |
| params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx]) |
| if params and params.group(1): |
| param_list = [p.strip() for p in params.group(1).split(",")] |
| if param_list: |
| param = param_list[0].split("=")[0].strip() |
| |
| type_check_found = False |
| for i in range(start_idx + 1, end_idx + 1): |
| if re.search(rf'(?:isinstance|type)\s*\(\s*{param}\s*,', lines[i]): |
| |
| lines[i] = f"# {lines[i]} # Type check removed" |
| type_check_found = True |
| break |
| |
| if not type_check_found: |
| |
| indent = 4 |
| for i in range(start_idx + 1, min(start_idx + 5, end_idx + 1)): |
| if lines[i].strip(): |
| indent = len(lines[i]) - len(lines[i].lstrip()) |
| break |
| |
| indent_str = ' ' * indent |
| |
| lines.insert(start_idx + 1, f"{indent_str}# Assuming {param} is a specific type without checking") |
| lines.insert(start_idx + 2, f"{indent_str}{param}_length = len({param}) # Will fail if {param} doesn't support len()") |
| |
| |
| modified_code = '\n'.join(lines) |
| state.code_context["code"] = modified_code |
| |
| |
| if "bugs" not in state.code_context: |
| state.code_context["bugs"] = [] |
| |
| state.code_context["bugs"].append({ |
| "type": "edge_case", |
| "line": start_idx + 1, |
| "description": f"Edge case issue introduced in function '{func_name}': {mod_type}" |
| }) |
| |
| def _generate_new_requirement(self, state: ProblemState, solution: str) -> Dict[str, Any]: |
| """ |
| Generate a new requirement based on the current state and solution. |
| |
| Args: |
| state: The current problem state |
| solution: The current solution |
| |
| Returns: |
| A new requirement dictionary |
| """ |
| |
| function_names = re.findall(r'def\s+(\w+)', solution) |
| variable_names = re.findall(r'(\w+)\s*=', solution) |
| |
| |
| req_type = random.choice([ |
| "edge_case_handling", |
| "performance_improvement", |
| "error_handling", |
| "type_checking", |
| "feature_addition" |
| ]) |
| |
| if req_type == "edge_case_handling": |
| if function_names: |
| func_name = random.choice(function_names) |
| edge_cases = [ |
| "empty input", |
| "negative values", |
| "zero values", |
| "extremely large values", |
| "special characters", |
| "duplicate values" |
| ] |
| edge_case = random.choice(edge_cases) |
| return { |
| "type": "edge_case_handling", |
| "description": f"The function '{func_name}' should handle {edge_case} correctly.", |
| "difficulty": random.uniform(0.3, 0.7) |
| } |
| |
| elif req_type == "performance_improvement": |
| return { |
| "type": "performance_improvement", |
| "description": "The solution should be optimized to run in O(n) time or better.", |
| "difficulty": random.uniform(0.4, 0.8) |
| } |
| |
| elif req_type == "error_handling": |
| error_types = [ |
| "invalid input", |
| "division by zero", |
| "file not found", |
| "network timeout", |
| "permission denied" |
| ] |
| error_type = random.choice(error_types) |
| return { |
| "type": "error_handling", |
| "description": f"The code should handle {error_type} errors gracefully.", |
| "difficulty": random.uniform(0.2, 0.6) |
| } |
| |
| elif req_type == "type_checking": |
| if function_names: |
| func_name = random.choice(function_names) |
| return { |
| "type": "type_checking", |
| "description": f"The function '{func_name}' should validate input types before processing.", |
| "difficulty": random.uniform(0.1, 0.5) |
| } |
| |
| elif req_type == "feature_addition": |
| features = [ |
| "logging capability", |
| "progress tracking", |
| "caching for repeated operations", |
| "parameter validation", |
| "configuration options" |
| ] |
| feature = random.choice(features) |
| return { |
| "type": "feature_addition", |
| "description": f"Add {feature} to the solution.", |
| "difficulty": random.uniform(0.3, 0.7) |
| } |
| |
| |
| return { |
| "type": "general_improvement", |
| "description": "Improve the overall code quality and readability.", |
| "difficulty": random.uniform(0.1, 0.4) |
| } |
| |
| def _modify_requirement(self, requirement: Dict[str, Any], state: ProblemState, solution: str) -> Dict[str, Any]: |
| """ |
| Modify an existing requirement to make it more challenging. |
| |
| Args: |
| requirement: The requirement to modify |
| state: The current problem state |
| solution: The current solution |
| |
| Returns: |
| The modified requirement |
| """ |
| |
| modified_req = copy.deepcopy(requirement) |
| |
| |
| modified_req["difficulty"] = min(1.0, requirement.get("difficulty", 0.3) + random.uniform(0.1, 0.3)) |
| |
| |
| if requirement["type"] == "edge_case_handling": |
| modified_req["description"] += " Additionally, it should handle very large inputs efficiently." |
| |
| elif requirement["type"] == "performance_improvement": |
| modified_req["description"] = modified_req["description"].replace("O(n)", "O(log n)") |
| |
| elif requirement["type"] == "error_handling": |
| modified_req["description"] += " And provide detailed error messages for debugging." |
| |
| elif requirement["type"] == "type_checking": |
| modified_req["description"] += " And automatically convert types when possible." |
| |
| elif requirement["type"] == "feature_addition": |
| modified_req["description"] += " Ensure this feature is configurable via parameters." |
| |
| else: |
| modified_req["description"] += " The code should also be well-documented with comments." |
| |
| return modified_req |
| |
| def _add_edge_case_requirement(self, state: ProblemState) -> None: |
| """ |
| Add a requirement for handling edge cases. |
| |
| Args: |
| state: The problem state to modify |
| """ |
| edge_cases = [ |
| "empty collections", |
| "null/None values", |
| "boundary values (min/max)", |
| "negative numbers", |
| "special characters", |
| "Unicode characters", |
| "very large inputs", |
| "malformed input" |
| ] |
| |
| edge_case = random.choice(edge_cases) |
| |
| |
| state.requirements.append({ |
| "type": "edge_case_handling", |
| "description": f"The solution must correctly handle {edge_case}.", |
| "difficulty": random.uniform(0.3, 0.7) |
| }) |
| |
| |
| if "tests" in state.code_context: |
| |
| test_template = self._generate_edge_case_test(edge_case, state.code_context) |
| if test_template: |
| state.code_context["tests"].append({ |
| "name": f"test_edge_case_{len(state.code_context['tests'])}", |
| "content": test_template, |
| "description": f"Test handling of {edge_case}" |
| }) |
| |
| def _increase_data_volume(self, state: ProblemState, solution: str) -> None: |
| """ |
| Modify the problem to require handling larger data volumes. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| """ |
| |
| state.requirements.append({ |
| "type": "scalability", |
| "description": "The solution must efficiently handle large datasets (10,000+ items).", |
| "difficulty": random.uniform(0.5, 0.8) |
| }) |
| |
| |
| if "tests" in state.code_context: |
| for i, test in enumerate(state.code_context["tests"]): |
| content = test["content"] |
| |
| |
| for pattern, replacement in [ |
| (r'\[[^\]]{0,50}\]', '[random.randint(0, 1000) for _ in range(10000)]'), |
| (r'range\(\d+\)', 'range(10000)'), |
| (r'"[^"]{0,20}"', '"' + 'a' * 10000 + '"') |
| ]: |
| match = re.search(pattern, content) |
| if match and random.random() < 0.3: |
| content = content.replace(match.group(0), replacement, 1) |
| break |
| |
| state.code_context["tests"][i]["content"] = content |
| state.code_context["tests"][i]["description"] = f"{test.get('description', 'Test')} (with large data)" |
| |
| def _add_performance_constraint(self, state: ProblemState, solution: str) -> None: |
| """ |
| Add a performance constraint to the problem. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| """ |
| |
| constraints = [ |
| "linear time complexity (O(n))", |
| "logarithmic time complexity (O(log n))", |
| "constant memory usage (O(1) space)", |
| "execution time under 100ms for large inputs", |
| "minimal function calls" |
| ] |
| |
| constraint = random.choice(constraints) |
| |
| |
| state.requirements.append({ |
| "type": "performance", |
| "description": f"The solution must achieve {constraint}.", |
| "difficulty": random.uniform(0.6, 0.9) |
| }) |
| |
| |
| if "tests" in state.code_context: |
| |
| perf_test = self._generate_performance_test(constraint, state.code_context) |
| if perf_test: |
| state.code_context["tests"].append({ |
| "name": f"test_performance_{len(state.code_context['tests'])}", |
| "content": perf_test, |
| "description": f"Test {constraint}" |
| }) |
| |
| def _expand_functionality(self, state: ProblemState, solution: str) -> None: |
| """ |
| Expand the required functionality of the solution. |
| |
| Args: |
| state: The problem state to modify |
| solution: The current solution |
| """ |
| |
| expansions = [ |
| "support for different input types", |
| "parameterized behavior", |
| "additional output formats", |
| "flexible error handling", |
| "integration with external systems" |
| ] |
| |
| expansion = random.choice(expansions) |
| |
| |
| state.requirements.append({ |
| "type": "functionality", |
| "description": f"Expand the solution to include {expansion}.", |
| "difficulty": random.uniform(0.4, 0.8) |
| }) |
| |
| |
| if "tests" in state.code_context: |
| |
| test_template = self._generate_functionality_test(expansion, state.code_context) |
| if test_template: |
| state.code_context["tests"].append({ |
| "name": f"test_expanded_functionality_{len(state.code_context['tests'])}", |
| "content": test_template, |
| "description": f"Test {expansion}" |
| }) |
| |
| def _generate_default_test(self) -> str: |
| """ |
| Generate a default test based on the current problem state. |
| |
| Returns: |
| A default test script |
| """ |
| |
| return """ |
| import unittest |
| import sys |
| import os |
| |
| # Add the directory containing the solution to the path |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| |
| # Import the solution |
| from solution import * |
| |
| class DefaultTest(unittest.TestCase): |
| def test_basic_functionality(self): |
| # A basic test that should pass if the solution is correct |
| self.assertTrue(True, "Basic assertion failed") |
| |
| def test_expected_output(self): |
| # Test expected output of main functions |
| # This will need to be updated based on the specific problem |
| pass |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| |
| def _generate_edge_case_test(self, edge_case: str, code_context: Dict[str, Any]) -> str: |
| """ |
| Generate a test for an edge case. |
| |
| Args: |
| edge_case: The edge case to test |
| code_context: The code context containing information about the problem |
| |
| Returns: |
| A test script for the edge case |
| """ |
| |
| function_names = [] |
| if "code" in code_context: |
| function_names = re.findall(r'def\s+(\w+)', code_context["code"]) |
| |
| if not function_names: |
| return None |
| |
| |
| function_name = random.choice(function_names) |
| |
| |
| if edge_case == "empty collections": |
| return f""" |
| import unittest |
| import sys |
| import os |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class EmptyCollectionTest(unittest.TestCase): |
| def test_empty_input(self): |
| # Test with empty list |
| result = {function_name}([]) |
| self.assertIsNotNone(result, "Function should handle empty list") |
| |
| # Test with empty string |
| result = {function_name}("") |
| self.assertIsNotNone(result, "Function should handle empty string") |
| |
| # Test with empty dict |
| result = {function_name}({{}}) |
| self.assertIsNotNone(result, "Function should handle empty dict") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| elif edge_case == "null/None values": |
| return f""" |
| import unittest |
| import sys |
| import os |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class NoneValueTest(unittest.TestCase): |
| def test_none_input(self): |
| # Test with None as input |
| result = {function_name}(None) |
| self.assertIsNotNone(result, "Function should handle None input") |
| |
| # Test with list containing None |
| result = {function_name}([1, None, 3]) |
| self.assertIsNotNone(result, "Function should handle list with None values") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| elif edge_case == "boundary values (min/max)": |
| return f""" |
| # recursive_swe_bench/task_generators/bug_fixing.py (completion) |
| |
| import unittest |
| import sys |
| import os |
| import sys |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class BoundaryValueTest(unittest.TestCase): |
| def test_min_max_values(self): |
| # Test with minimum integer |
| min_int = -sys.maxsize - 1 |
| result = {function_name}(min_int) |
| self.assertIsNotNone(result, "Function should handle minimum integer") |
| |
| # Test with maximum integer |
| max_int = sys.maxsize |
| result = {function_name}(max_int) |
| self.assertIsNotNone(result, "Function should handle maximum integer") |
| |
| # Test with very large list |
| large_list = list(range(10000)) |
| result = {function_name}(large_list) |
| self.assertIsNotNone(result, "Function should handle very large inputs") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| elif edge_case == "negative numbers": |
| return f""" |
| import unittest |
| import sys |
| import os |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class NegativeNumberTest(unittest.TestCase): |
| def test_negative_numbers(self): |
| # Test with negative number |
| result = {function_name}(-1) |
| self.assertIsNotNone(result, "Function should handle negative numbers") |
| |
| # Test with list of negative numbers |
| result = {function_name}([-1, -2, -3]) |
| self.assertIsNotNone(result, "Function should handle lists of negative numbers") |
| |
| # Test with mixed positive and negative |
| result = {function_name}([-1, 0, 1]) |
| self.assertIsNotNone(result, "Function should handle mixed positive and negative") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| else: |
| |
| return f""" |
| import unittest |
| import sys |
| import os |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class EdgeCaseTest(unittest.TestCase): |
| def test_edge_case_{edge_case.replace(' ', '_')}(self): |
| # Test edge case: {edge_case} |
| # This is a placeholder test that needs to be customized for the specific edge case |
| self.assertTrue(True, "Edge case test not implemented") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| |
| def _generate_performance_test(self, constraint: str, code_context: Dict[str, Any]) -> str: |
| """ |
| Generate a performance test based on a constraint. |
| |
| Args: |
| constraint: The performance constraint |
| code_context: The code context containing information about the problem |
| |
| Returns: |
| A test script for the performance constraint |
| """ |
| |
| function_names = [] |
| if "code" in code_context: |
| function_names = re.findall(r'def\s+(\w+)', code_context["code"]) |
| |
| if not function_names: |
| return None |
| |
| |
| function_name = random.choice(function_names) |
| |
| if "time complexity" in constraint: |
| return f""" |
| import unittest |
| import sys |
| import os |
| import time |
| import random |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class PerformanceTest(unittest.TestCase): |
| def test_time_complexity(self): |
| # Test for {constraint} |
| sizes = [100, 1000, 10000] |
| times = [] |
| |
| for size in sizes: |
| # Generate input of the given size |
| input_data = [random.randint(0, 1000) for _ in range(size)] |
| |
| # Measure execution time |
| start_time = time.time() |
| {function_name}(input_data) |
| end_time = time.time() |
| |
| times.append(end_time - start_time) |
| |
| # Check if time grows appropriately |
| # For O(n), time should grow linearly with input size |
| # For O(log n), time should grow logarithmically |
| # This is a simplified check and might need adjustment |
| if "log n" in "{constraint}": |
| # For logarithmic time, the ratio of times should decrease |
| ratio1 = times[1] / times[0] |
| ratio2 = times[2] / times[1] |
| self.assertLess(ratio2, ratio1 * 1.5, |
| f"Growth rate appears super-logarithmic: {times}") |
| else: # Assume linear or better |
| # For linear time, the ratio of times should be roughly equal to ratio of sizes |
| ratio1 = times[1] / times[0] |
| size_ratio1 = sizes[1] / sizes[0] |
| |
| ratio2 = times[2] / times[1] |
| size_ratio2 = sizes[2] / sizes[1] |
| |
| self.assertLess(ratio1, size_ratio1 * 1.5, |
| f"First growth rate appears super-linear: {times}") |
| self.assertLess(ratio2, size_ratio2 * 1.5, |
| f"Second growth rate appears super-linear: {times}") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| elif "execution time" in constraint: |
| return f""" |
| import unittest |
| import sys |
| import os |
| import time |
| import random |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class PerformanceTest(unittest.TestCase): |
| def test_execution_time(self): |
| # Test for {constraint} |
| # Generate a large input |
| input_data = [random.randint(0, 1000) for _ in range(10000)] |
| |
| # Measure execution time |
| start_time = time.time() |
| {function_name}(input_data) |
| end_time = time.time() |
| |
| execution_time = (end_time - start_time) * 1000 # Convert to ms |
| |
| self.assertLess(execution_time, 100, |
| f"Execution time exceeded 100ms: {execution_time:.2f}ms") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| elif "memory usage" in constraint: |
| return f""" |
| import unittest |
| import sys |
| import os |
| import psutil |
| import random |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class MemoryUsageTest(unittest.TestCase): |
| def test_memory_usage(self): |
| # Test for {constraint} |
| # Note: This is an approximate test and may not be accurate in all environments |
| |
| # Get current process |
| process = psutil.Process(os.getpid()) |
| |
| # Measure memory before |
| memory_before = process.memory_info().rss / 1024 / 1024 # MB |
| |
| # Generate a large input |
| input_data = [random.randint(0, 1000) for _ in range(100000)] |
| |
| # Run function |
| {function_name}(input_data) |
| |
| # Measure memory after |
| memory_after = process.memory_info().rss / 1024 / 1024 # MB |
| |
| # Calculate memory usage |
| memory_used = memory_after - memory_before |
| |
| # A crude approximation, adjust as needed |
| self.assertLess(memory_used, 10, |
| f"Memory usage seems high: {memory_used:.2f}MB") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| else: |
| |
| return f""" |
| import unittest |
| import sys |
| import os |
| import time |
| import random |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class PerformanceTest(unittest.TestCase): |
| def test_performance(self): |
| # Test for {constraint} |
| # This is a placeholder test that needs to be customized for the specific constraint |
| |
| # Generate a large input |
| input_data = [random.randint(0, 1000) for _ in range(10000)] |
| |
| # Measure execution time |
| start_time = time.time() |
| {function_name}(input_data) |
| end_time = time.time() |
| |
| execution_time = end_time - start_time |
| |
| # Just log the time for now |
| print(f"Execution time: {execution_time:.4f} seconds") |
| self.assertTrue(True, "Performance test completed") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| |
| def _generate_functionality_test(self, expansion: str, code_context: Dict[str, Any]) -> str: |
| """ |
| Generate a test for expanded functionality. |
| |
| Args: |
| expansion: The functionality expansion |
| code_context: The code context containing information about the problem |
| |
| Returns: |
| A test script for the expanded functionality |
| """ |
| |
| function_names = [] |
| if "code" in code_context: |
| function_names = re.findall(r'def\s+(\w+)', code_context["code"]) |
| |
| if not function_names: |
| return None |
| |
| |
| function_name = random.choice(function_names) |
| |
| if "different input types" in expansion: |
| return f""" |
| import unittest |
| import sys |
| import os |
| import json |
| from collections import namedtuple |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class InputTypesTest(unittest.TestCase): |
| def test_different_input_types(self): |
| # Test with different types of inputs |
| |
| # Test with list |
| list_input = [1, 2, 3] |
| list_result = {function_name}(list_input) |
| self.assertIsNotNone(list_result, "Function should handle list input") |
| |
| # Test with tuple |
| tuple_input = (1, 2, 3) |
| tuple_result = {function_name}(tuple_input) |
| self.assertIsNotNone(tuple_result, "Function should handle tuple input") |
| |
| # Test with set |
| set_input = {{1, 2, 3}} |
| set_result = {function_name}(set_input) |
| self.assertIsNotNone(set_result, "Function should handle set input") |
| |
| # Test with dictionary |
| dict_input = {{"a": 1, "b": 2, "c": 3}} |
| dict_result = {function_name}(dict_input) |
| self.assertIsNotNone(dict_result, "Function should handle dictionary input") |
| |
| # Test with JSON string |
| json_input = '{{"data": [1, 2, 3]}}' |
| json_result = {function_name}(json_input) |
| self.assertIsNotNone(json_result, "Function should handle JSON string") |
| |
| # Test with custom object |
| Point = namedtuple('Point', ['x', 'y']) |
| obj_input = Point(1, 2) |
| obj_result = {function_name}(obj_input) |
| self.assertIsNotNone(obj_result, "Function should handle custom object") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| elif "parameterized behavior" in expansion: |
| return f""" |
| import unittest |
| import sys |
| import os |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class ParameterizedTest(unittest.TestCase): |
| def test_parameterized_behavior(self): |
| # Test function with different parameters |
| |
| # Base case with default parameters |
| base_input = [1, 2, 3] |
| base_result = {function_name}(base_input) |
| |
| # The function should now accept additional parameters |
| # These are example parameters, adjust based on the specific function |
| |
| # With sorting parameter |
| try: |
| sorted_result = {function_name}(base_input, sort=True) |
| self.assertIsNotNone(sorted_result, "Function should handle sort parameter") |
| except TypeError as e: |
| self.fail(f"Function does not support sort parameter: {{e}}") |
| |
| # With filtering parameter |
| try: |
| filtered_result = {function_name}(base_input, filter_fn=lambda x: x > 1) |
| self.assertIsNotNone(filtered_result, "Function should handle filter_fn parameter") |
| except TypeError as e: |
| self.fail(f"Function does not support filter_fn parameter: {{e}}") |
| |
| # With formatting parameter |
| try: |
| formatted_result = {function_name}(base_input, format="json") |
| self.assertIsNotNone(formatted_result, "Function should handle format parameter") |
| except TypeError as e: |
| self.fail(f"Function does not support format parameter: {{e}}") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| elif "additional output formats" in expansion: |
| return f""" |
| import unittest |
| import sys |
| import os |
| import json |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class OutputFormatsTest(unittest.TestCase): |
| def test_output_formats(self): |
| # Test function with different output formats |
| input_data = [1, 2, 3] |
| |
| # Original format |
| original_result = {function_name}(input_data) |
| |
| # The function should now support different output formats |
| # These are example formats, adjust based on the specific function |
| |
| # JSON format |
| try: |
| json_result = {function_name}(input_data, format="json") |
| # Check if it's valid JSON |
| try: |
| json_obj = json.loads(json_result) if isinstance(json_result, str) else json_result |
| self.assertIsNotNone(json_obj, "JSON result should be valid") |
| except json.JSONDecodeError: |
| self.fail("JSON result is not valid") |
| except TypeError as e: |
| self.fail(f"Function does not support JSON format: {{e}}") |
| |
| # CSV format |
| try: |
| csv_result = {function_name}(input_data, format="csv") |
| self.assertIsNotNone(csv_result, "CSV result should not be None") |
| if isinstance(csv_result, str): |
| self.assertIn(",", csv_result, "CSV result should contain commas") |
| except TypeError as e: |
| self.fail(f"Function does not support CSV format: {{e}}") |
| |
| # XML format |
| try: |
| xml_result = {function_name}(input_data, format="xml") |
| self.assertIsNotNone(xml_result, "XML result should not be None") |
| if isinstance(xml_result, str): |
| self.assertIn("<", xml_result, "XML result should contain tags") |
| self.assertIn(">", xml_result, "XML result should contain tags") |
| except TypeError as e: |
| self.fail(f"Function does not support XML format: {{e}}") |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| else: |
| |
| return f""" |
| import unittest |
| import sys |
| import os |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import {function_name} |
| |
| class ExpandedFunctionalityTest(unittest.TestCase): |
| def test_expanded_functionality(self): |
| # Test for {expansion} |
| # This is a placeholder test that needs to be customized for the specific expansion |
| |
| # Basic test to verify the function exists |
| input_data = [1, 2, 3] |
| result = {function_name}(input_data) |
| self.assertIsNotNone(result, "Function should return a result") |
| |
| # You need to add specific tests for the expanded functionality |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| |
| def _calculate_adaptation_vector(self, solution: str, result: EvaluationResult, feedback: Feedback) -> List[float]: |
| """ |
| Calculate an adaptation vector based on the solution, result, and feedback. |
| |
| The adaptation vector encodes how the problem should evolve in future iterations, |
| capturing dimensions like difficulty, bug type emphasis, and feedback focus. |
| |
| Args: |
| solution: The current solution |
| result: The evaluation results |
| feedback: The feedback provided |
| |
| Returns: |
| An adaptation vector (list of floats) |
| """ |
| |
| |
| |
| |
| |
| |
| |
| adaptation_vector = [0.0] * 5 |
| |
| |
| if result.score > 0.95: |
| adaptation_vector[0] = 0.2 |
| elif result.score > 0.8: |
| adaptation_vector[0] = 0.1 |
| elif result.score > 0.6: |
| adaptation_vector[0] = 0.0 |
| elif result.score > 0.4: |
| adaptation_vector[0] = -0.1 |
| else: |
| adaptation_vector[0] = -0.2 |
| |
| |
| syntax_issues = sum(1 for issue in feedback.issues if issue.get("error_type") == "syntax") |
| logical_issues = sum(1 for issue in feedback.issues if issue.get("type") == "test_failure") |
| |
| if syntax_issues > logical_issues: |
| adaptation_vector[1] = -0.1 |
| elif logical_issues > syntax_issues: |
| adaptation_vector[1] = 0.1 |
| |
| |
| if result.metrics and "execution_time" in result.metrics: |
| if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0): |
| adaptation_vector[2] = 0.2 |
| else: |
| adaptation_vector[2] = -0.1 |
| |
| |
| if result.test_results: |
| edge_case_failures = sum(1 for test_name, test_result in result.test_results.items() |
| if not test_result["passed"] and "edge" in test_name.lower()) |
| if edge_case_failures > 0: |
| adaptation_vector[3] = 0.2 |
| else: |
| adaptation_vector[3] = 0.0 |
| |
| |
| current_requirements = len(self.state.requirements) |
| if current_requirements < 3: |
| adaptation_vector[4] = 0.1 |
| elif current_requirements >= 5: |
| adaptation_vector[4] = -0.1 |
| |
| return adaptation_vector |
|
|
|
|
| class DefaultTestRunner: |
| """Default test runner for evaluating bug fixes.""" |
| |
| def run_tests(self, solution_file: Path, test_files: List[Path], code_context: Dict[str, Any]) -> Dict[str, Any]: |
| """ |
| Run tests against a solution file. |
| |
| Args: |
| solution_file: Path to the solution file |
| test_files: List of test file paths |
| code_context: Context information about the code |
| |
| Returns: |
| Dictionary of test results |
| """ |
| |
| results = { |
| "all_passed": True, |
| "passed_tests": 0, |
| "total_tests": 0, |
| "tests": {}, |
| "execution": { |
| "success": True, |
| "error": None, |
| "stdout": None, |
| "stderr": None |
| }, |
| "execution_time": 0.0 |
| } |
| |
| |
| try: |
| |
| if not solution_file.exists(): |
| results["execution"]["success"] = False |
| results["execution"]["error"] = "Solution file not found" |
| results["all_passed"] = False |
| return results |
| |
| |
| sys.path.insert(0, str(solution_file.parent)) |
| import importlib.util |
| spec = importlib.util.spec_from_file_location("solution", solution_file) |
| solution_module = importlib.util.module_from_spec(spec) |
| spec.loader.exec_module(solution_module) |
| |
| |
| if "required_functions" in code_context: |
| for func_name in code_context["required_functions"]: |
| if not hasattr(solution_module, func_name): |
| results["execution"]["success"] = False |
| results["execution"]["error"] = f"Required function '{func_name}' not found" |
| results["all_passed"] = False |
| return results |
| |
| except Exception as e: |
| results["execution"]["success"] = False |
| results["execution"]["error"] = str(e) |
| results["all_passed"] = False |
| return results |
| |
| |
| for test_file in test_files: |
| |
| if not test_file.exists(): |
| continue |
| |
| |
| import unittest |
| import io |
| from contextlib import redirect_stdout, redirect_stderr |
| |
| |
| loader = unittest.TestLoader() |
| try: |
| tests = loader.discover(str(test_file.parent), pattern=test_file.name) |
| |
| |
| test_cases = 0 |
| for suite in tests: |
| for test_case in suite: |
| test_cases += test_case.countTestCases() |
| |
| results["total_tests"] += test_cases |
| |
| |
| runner = unittest.TextTestRunner(verbosity=2) |
| |
| |
| stdout_buffer = io.StringIO() |
| stderr_buffer = io.StringIO() |
| |
| with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): |
| test_result = runner.run(tests) |
| |
| stdout = stdout_buffer.getvalue() |
| stderr = stderr_buffer.getvalue() |
| |
| |
| if not test_result.wasSuccessful(): |
| results["all_passed"] = False |
| |
| |
| passed_tests = test_cases - len(test_result.failures) - len(test_result.errors) |
| results["passed_tests"] += passed_tests |
| |
| |
| test_name = test_file.stem |
| results["tests"][test_name] = { |
| "passed": test_result.wasSuccessful(), |
| "failures": len(test_result.failures), |
| "errors": len(test_result.errors), |
| "skipped": len(test_result.skipped), |
| "total": test_cases, |
| "passed_count": passed_tests, |
| "stdout": stdout, |
| "stderr": stderr |
| } |
| |
| |
| for failure in test_result.failures: |
| test_id = failure[0].id() |
| failure_message = failure[1] |
| |
| |
| import re |
| expected_match = re.search(r'Expected\s*:(.+)', failure_message) |
| actual_match = re.search(r'Actual\s*:(.+)', failure_message) |
| |
| expected = expected_match.group(1).strip() if expected_match else None |
| actual = actual_match.group(1).strip() if actual_match else None |
| |
| if test_id not in results["tests"]: |
| results["tests"][test_id] = {} |
| |
| results["tests"][test_id].update({ |
| "passed": False, |
| "message": failure_message, |
| "expected": expected, |
| "actual": actual |
| }) |
| |
| except Exception as e: |
| |
| results["all_passed"] = False |
| results["tests"][test_file.stem] = { |
| "passed": False, |
| "error": str(e), |
| "failures": 1, |
| "errors": 1, |
| "skipped": 0, |
| "total": 1, |
| "passed_count": 0 |
| } |
| results["total_tests"] += 1 |
| |
| return results |
|
|
|
|
| class BugFixingTaskGenerator: |
| """Generator for bug fixing tasks.""" |
| |
| def __init__(self, config: Dict[str, Any] = None): |
| """ |
| Initialize the bug fixing task generator. |
| |
| Args: |
| config: Configuration options |
| """ |
| self.config = config or {} |
| self.difficulty_levels = self.config.get( |
| "difficulty_levels", |
| ["easy", "medium", "hard", "expert"] |
| ) |
| self.bug_categories = self.config.get( |
| "bug_categories", |
| [ |
| BugCategory.SYNTAX, |
| BugCategory.LOGICAL, |
| BugCategory.EDGE_CASE, |
| BugCategory.PERFORMANCE |
| ] |
| ) |
| self.test_templates = self._load_test_templates() |
| |
| def generate_task(self, difficulty: str = None, bug_categories: List[str] = None) -> BugFixingTask: |
| """ |
| Generate a new bug fixing task. |
| |
| Args: |
| difficulty: The difficulty level (easy, medium, hard, expert) |
| bug_categories: List of bug categories to include |
| |
| Returns: |
| A new bug fixing task |
| """ |
| |
| if difficulty is None: |
| difficulty = random.choice(self.difficulty_levels) |
| |
| |
| if bug_categories is None: |
| num_categories = random.randint(1, 3) |
| bug_categories = random.sample(self.bug_categories, num_categories) |
| |
| |
| problem_state = self._generate_problem_state(difficulty, bug_categories) |
| |
| |
| task_config = { |
| "difficulty": difficulty, |
| "bug_categories": bug_categories, |
| "convergence_criteria": { |
| "score_threshold": 0.95, |
| "min_iterations": 1, |
| "max_iterations": self.config.get("max_iterations", 5), |
| "score_delta_threshold": 0.05, |
| "consecutive_plateau_limit": 2 |
| }, |
| "score_weights": { |
| "test": 0.7, |
| "execution": 0.3 |
| }, |
| "performance_threshold": 1.0, |
| "complexity_threshold": 0.7 |
| } |
| |
| |
| return BugFixingTask(problem_state, task_config) |
| |
| def _generate_problem_state(self, difficulty: str, bug_categories: List[str]) -> ProblemState: |
| """ |
| Generate a problem state for the given difficulty and bug categories. |
| |
| Args: |
| difficulty: The difficulty level |
| bug_categories: List of bug categories |
| |
| Returns: |
| A problem state for the task |
| """ |
| |
| template = self._choose_template(difficulty, bug_categories) |
| |
| |
| problem_state = copy.deepcopy(template) |
| |
| |
| problem_state.problem_id = str(uuid.uuid4()) |
| |
| |
| problem_state.evolution_stage = 0 |
| problem_state.adaptation_vector = [0.0] * 5 |
| |
| |
| difficulty_values = { |
| "easy": 0.25, |
| "medium": 0.5, |
| "hard": 0.75, |
| "expert": 0.9 |
| } |
| problem_state.difficulty = difficulty_values.get(difficulty, 0.5) |
| |
| |
| for category in bug_categories: |
| self._insert_bug(problem_state, category) |
| |
| |
| problem_state.description = self._generate_description(problem_state) |
| |
| return problem_state |
| |
| def _choose_template(self, difficulty: str, bug_categories: List[str]) -> ProblemState: |
| """ |
| Choose a template that matches the difficulty and bug categories. |
| |
| Args: |
| difficulty: The difficulty level |
| bug_categories: List of bug categories |
| |
| Returns: |
| A template problem state |
| """ |
| |
| |
| |
| |
| code = self._generate_template_code(difficulty, bug_categories) |
| tests = self._generate_template_tests(code) |
| |
| |
| return ProblemState( |
| problem_id="template", |
| description="Fix the bugs in the given code.", |
| code_context={ |
| "code": code, |
| "tests": tests, |
| "bug_count": 0, |
| "bug_categories": [] |
| }, |
| requirements=[ |
| { |
| "type": "functional", |
| "description": "The code should pass all the provided tests.", |
| "difficulty": 0.3 |
| } |
| ], |
| difficulty=0.5, |
| evolution_stage=0, |
| adaptation_vector=[0.0] * 5 |
| ) |
| |
| def _generate_template_code(self, difficulty: str, bug_categories: List[str]) -> str: |
| """ |
| Generate template code based on difficulty and bug categories. |
| |
| Args: |
| difficulty: The difficulty level |
| bug_categories: List of bug categories |
| |
| Returns: |
| Template code |
| """ |
| |
| templates = { |
| "easy": """ |
| def calculate_sum(numbers): |
| \"\"\"Calculate the sum of a list of numbers.\"\"\" |
| total = 0 |
| for num in numbers: |
| total += num |
| return total |
| |
| def calculate_average(numbers): |
| \"\"\"Calculate the average of a list of numbers.\"\"\" |
| if not numbers: |
| return 0 |
| return calculate_sum(numbers) / len(numbers) |
| """, |
| "medium": """ |
| def find_most_frequent(items): |
| \"\"\"Find the most frequently occurring item in |
| # recursive_swe_bench/task_generators/bug_fixing.py (template generation) |
| |
| def find_most_frequent(items): |
| """Find the most frequently occurring item in a list.""" |
| if not items: |
| return None |
| |
| counts = {} |
| for item in items: |
| if item in counts: |
| counts[item] += 1 |
| else: |
| counts[item] = 1 |
| |
| max_count = 0 |
| max_item = None |
| for item, count in counts.items(): |
| if count > max_count: |
| max_count = count |
| max_item = item |
| |
| return max_item |
| |
| def binary_search(sorted_list, target): |
| """Perform binary search on a sorted list.""" |
| left = 0 |
| right = len(sorted_list) - 1 |
| |
| while left <= right: |
| mid = (left + right) // 2 |
| if sorted_list[mid] == target: |
| return mid |
| elif sorted_list[mid] < target: |
| left = mid + 1 |
| else: |
| right = mid - 1 |
| |
| return -1 # Target not found |
| """, |
| "hard": """ |
| def merge_sort(arr): |
| """Sort an array using the merge sort algorithm.""" |
| if len(arr) <= 1: |
| return arr |
| |
| # Split the array into two halves |
| mid = len(arr) // 2 |
| left_half = arr[:mid] |
| right_half = arr[mid:] |
| |
| # Recursively sort both halves |
| left_half = merge_sort(left_half) |
| right_half = merge_sort(right_half) |
| |
| # Merge the sorted halves |
| return merge(left_half, right_half) |
| |
| def merge(left, right): |
| """Merge two sorted arrays.""" |
| result = [] |
| i = j = 0 |
| |
| # Compare elements from both arrays and add the smaller one to the result |
| while i < len(left) and j < len(right): |
| if left[i] <= right[j]: |
| result.append(left[i]) |
| i += 1 |
| else: |
| result.append(right[j]) |
| j += 1 |
| |
| # Add any remaining elements |
| result.extend(left[i:]) |
| result.extend(right[j:]) |
| |
| return result |
| |
| def quicksort(arr): |
| """Sort an array using the quicksort algorithm.""" |
| if len(arr) <= 1: |
| return arr |
| |
| # Choose the pivot (using the first element for simplicity) |
| pivot = arr[0] |
| |
| # Partition the array |
| less = [x for x in arr[1:] if x <= pivot] |
| greater = [x for x in arr[1:] if x > pivot] |
| |
| # Recursively sort the partitions and combine |
| return quicksort(less) + [pivot] + quicksort(greater) |
| """, |
| "expert": """ |
| class Node: |
| """Node in a binary tree.""" |
| def __init__(self, value): |
| self.value = value |
| self.left = None |
| self.right = None |
| |
| def build_binary_tree(values): |
| """Build a binary tree from a list of values.""" |
| if not values: |
| return None |
| |
| root = Node(values[0]) |
| queue = [root] |
| i = 1 |
| |
| while queue and i < len(values): |
| node = queue.pop(0) |
| |
| # Add left child |
| if i < len(values) and values[i] is not None: |
| node.left = Node(values[i]) |
| queue.append(node.left) |
| i += 1 |
| |
| # Add right child |
| if i < len(values) and values[i] is not None: |
| node.right = Node(values[i]) |
| queue.append(node.right) |
| i += 1 |
| |
| return root |
| |
| def is_balanced(root): |
| """Check if a binary tree is balanced.""" |
| def height(node): |
| if not node: |
| return 0 |
| return max(height(node.left), height(node.right)) + 1 |
| |
| def is_balanced_helper(node): |
| if not node: |
| return True |
| |
| left_height = height(node.left) |
| right_height = height(node.right) |
| |
| if abs(left_height - right_height) > 1: |
| return False |
| |
| return is_balanced_helper(node.left) and is_balanced_helper(node.right) |
| |
| return is_balanced_helper(root) |
| |
| def find_lca(root, p, q): |
| """Find the lowest common ancestor of two nodes in a binary tree.""" |
| if not root: |
| return None |
| |
| if root.value == p or root.value == q: |
| return root |
| |
| left_lca = find_lca(root.left, p, q) |
| right_lca = find_lca(root.right, p, q) |
| |
| if left_lca and right_lca: |
| return root |
| |
| return left_lca if left_lca else right_lca |
| """ |
| } |
| |
| |
| if difficulty in templates: |
| return templates[difficulty] |
| else: |
| return templates["medium"] |
| |
| def _generate_template_tests(self, code: str) -> List[Dict[str, Any]]: |
| """ |
| Generate template tests based on the code. |
| |
| Args: |
| code: The template code |
| |
| Returns: |
| List of test dictionaries |
| """ |
| |
| function_names = re.findall(r'def\s+(\w+)', code) |
| |
| |
| tests = [] |
| for func_name in function_names: |
| test_content = self._generate_test_for_function(func_name) |
| if test_content: |
| tests.append({ |
| "name": f"test_{func_name}", |
| "content": test_content, |
| "description": f"Test for {func_name} function" |
| }) |
| |
| return tests |
| |
    def _generate_test_for_function(self, func_name: str) -> str:
        """
        Generate a test for a specific function.

        Dispatches on substrings of ``func_name`` (checked in order) and
        returns the source of a self-contained unittest module that
        imports the function from a sibling ``solution`` module.
        Unrecognized names fall through to a placeholder test.

        Args:
            func_name: The name of the function to test

        Returns:
            Test content (Python source for a unittest module)
        """
        # Exact-name templates take priority over substring matching.
        if func_name in self.test_templates:
            return self.test_templates[func_name]

        # Substring dispatch. Branch order matters: e.g. "merge_sort"
        # contains "sort" and must not be caught by an earlier branch.
        # NOTE(review): most bodies import a fixed name (calculate_sum,
        # binary_search, ...) rather than func_name itself — a different
        # function matching the substring (e.g. "my_sum") would get a
        # test for the wrong symbol. Confirm this is intentional.
        if "sum" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_sum

class TestCalculateSum(unittest.TestCase):
    def test_calculate_sum(self):
        self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15)
        self.assertEqual(calculate_sum([]), 0)
        self.assertEqual(calculate_sum([-1, -2, -3]), -6)

if __name__ == '__main__':
    unittest.main()
"""
        elif "average" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_average

class TestCalculateAverage(unittest.TestCase):
    def test_calculate_average(self):
        self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3)
        self.assertEqual(calculate_average([]), 0)
        self.assertEqual(calculate_average([10]), 10)

if __name__ == '__main__':
    unittest.main()
"""
        elif "frequent" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import find_most_frequent

class TestFindMostFrequent(unittest.TestCase):
    def test_find_most_frequent(self):
        self.assertEqual(find_most_frequent([1, 2, 2, 3, 3, 3, 4]), 3)
        self.assertEqual(find_most_frequent(['a', 'b', 'a', 'c', 'a']), 'a')
        self.assertIsNone(find_most_frequent([]))
        self.assertEqual(find_most_frequent([5]), 5)

if __name__ == '__main__':
    unittest.main()
"""
        elif "search" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import binary_search

class TestBinarySearch(unittest.TestCase):
    def test_binary_search(self):
        self.assertEqual(binary_search([1, 2, 3, 4, 5], 3), 2)
        self.assertEqual(binary_search([1, 2, 3, 4, 5], 1), 0)
        self.assertEqual(binary_search([1, 2, 3, 4, 5], 5), 4)
        self.assertEqual(binary_search([1, 2, 3, 4, 5], 6), -1)
        self.assertEqual(binary_search([], 5), -1)

if __name__ == '__main__':
    unittest.main()
"""
        # The "sort" branch is the only one parameterized by func_name,
        # since both merge_sort and quicksort share the same contract.
        elif "sort" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {0}

class Test{1}(unittest.TestCase):
    def test_sorting(self):
        self.assertEqual({0}([]), [])
        self.assertEqual({0}([1]), [1])
        self.assertEqual({0}([3, 1, 4, 1, 5, 9, 2, 6, 5]), [1, 1, 2, 3, 4, 5, 5, 6, 9])
        self.assertEqual({0}([9, 8, 7, 6, 5, 4, 3, 2, 1]), [1, 2, 3, 4, 5, 6, 7, 8, 9])
        self.assertEqual({0}([1, 1, 1, 1]), [1, 1, 1, 1])

if __name__ == '__main__':
    unittest.main()
""".format(func_name, func_name.title())
        elif "balanced" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import Node, is_balanced

class TestIsBalanced(unittest.TestCase):
    def test_is_balanced(self):
        # Create a balanced tree
        #        1
        #       / \\
        #      2   3
        #     / \\ / \\
        #    4  5 6  7
        root = Node(1)
        root.left = Node(2)
        root.right = Node(3)
        root.left.left = Node(4)
        root.left.right = Node(5)
        root.right.left = Node(6)
        root.right.right = Node(7)
        self.assertTrue(is_balanced(root))

        # Create an unbalanced tree
        #        1
        #       / \\
        #      2   3
        #     / \\
        #    4  5
        #   /
        #  6
        root = Node(1)
        root.left = Node(2)
        root.right = Node(3)
        root.left.left = Node(4)
        root.left.right = Node(5)
        root.left.left.left = Node(6)
        self.assertFalse(is_balanced(root))

        # Empty tree is balanced
        self.assertTrue(is_balanced(None))

if __name__ == '__main__':
    unittest.main()
"""
        elif "lca" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import Node, find_lca

class TestFindLCA(unittest.TestCase):
    def test_find_lca(self):
        # Create a tree
        #        1
        #       / \\
        #      2   3
        #     / \\ / \\
        #    4  5 6  7
        root = Node(1)
        root.left = Node(2)
        root.right = Node(3)
        root.left.left = Node(4)
        root.left.right = Node(5)
        root.right.left = Node(6)
        root.right.right = Node(7)

        # Test cases
        self.assertEqual(find_lca(root, 4, 5).value, 2)  # LCA of 4 and 5 is 2
        self.assertEqual(find_lca(root, 4, 6).value, 1)  # LCA of 4 and 6 is 1
        self.assertEqual(find_lca(root, 3, 7).value, 3)  # LCA of 3 and 7 is 3
        self.assertEqual(find_lca(root, 2, 7).value, 1)  # LCA of 2 and 7 is 1

if __name__ == '__main__':
    unittest.main()
"""
        elif "tree" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import Node, build_binary_tree

class TestBuildBinaryTree(unittest.TestCase):
    def test_build_binary_tree(self):
        # Test empty list
        self.assertIsNone(build_binary_tree([]))

        # Test single node
        root = build_binary_tree([1])
        self.assertEqual(root.value, 1)
        self.assertIsNone(root.left)
        self.assertIsNone(root.right)

        # Test complete tree
        #        1
        #       / \\
        #      2   3
        #     / \\ / \\
        #    4  5 6  7
        values = [1, 2, 3, 4, 5, 6, 7]
        root = build_binary_tree(values)
        self.assertEqual(root.value, 1)
        self.assertEqual(root.left.value, 2)
        self.assertEqual(root.right.value, 3)
        self.assertEqual(root.left.left.value, 4)
        self.assertEqual(root.left.right.value, 5)
        self.assertEqual(root.right.left.value, 6)
        self.assertEqual(root.right.right.value, 7)

        # Test tree with None values
        #        1
        #       / \\
        #      2   3
        #     /   /
        #    4   6
        values = [1, 2, 3, 4, None, 6, None]
        root = build_binary_tree(values)
        self.assertEqual(root.value, 1)
        self.assertEqual(root.left.value, 2)
        self.assertEqual(root.right.value, 3)
        self.assertEqual(root.left.left.value, 4)
        self.assertIsNone(root.left.right)
        self.assertEqual(root.right.left.value, 6)
        self.assertIsNone(root.right.right)

if __name__ == '__main__':
    unittest.main()
"""
        else:
            # Unknown function: emit a placeholder module so the caller
            # still gets a syntactically valid test file.
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {0}

class Test{1}(unittest.TestCase):
    def test_{0}(self):
        # TODO: Add specific test cases for {0}
        # This is a placeholder test
        self.assertTrue(True)

if __name__ == '__main__':
    unittest.main()
""".format(func_name, func_name.title())
| |
| def _load_test_templates(self) -> Dict[str, str]: |
| """ |
| Load test templates for common functions. |
| |
| Returns: |
| Dictionary of test templates |
| """ |
| |
| return { |
| "calculate_sum": """ |
| import unittest |
| import sys |
| import os |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import calculate_sum |
| |
| class TestCalculateSum(unittest.TestCase): |
| def test_calculate_sum(self): |
| self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15) |
| self.assertEqual(calculate_sum([]), 0) |
| self.assertEqual(calculate_sum([-1, -2, -3]), -6) |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """, |
| "calculate_average": """ |
| import unittest |
| import sys |
| import os |
| |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| from solution import calculate_average |
| |
| class TestCalculateAverage(unittest.TestCase): |
| def test_calculate_average(self): |
| self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3) |
| self.assertEqual(calculate_average([]), 0) |
| self.assertEqual(calculate_average([10]), 10) |
| |
| if __name__ == '__main__': |
| unittest.main() |
| """ |
| } |
| |
| def _insert_bug(self, problem_state: ProblemState, bug_category: str) -> None: |
| """ |
| Insert a bug of the specified category into the problem state. |
| |
| Args: |
| problem_state: The problem state to modify |
| bug_category: The category of bug to insert |
| """ |
| if "code" not in problem_state.code_context: |
| return |
| |
| |
| code = problem_state.code_context["code"] |
| try: |
| parsed_code = ast.parse(code) |
| except SyntaxError: |
| |
| return |
| |
| |
| if bug_category == BugCategory.SYNTAX: |
| self._insert_syntax_bug(problem_state) |
| elif bug_category == BugCategory.LOGICAL: |
| self._insert_logical_bug(problem_state) |
| elif bug_category == BugCategory.PERFORMANCE: |
| self._insert_performance_bug(problem_state) |
| elif bug_category == BugCategory.EDGE_CASE: |
| self._insert_edge_case_bug(problem_state) |
| else: |
| |
| self._insert_logical_bug(problem_state) |
| |
| |
| if "bug_count" not in problem_state.code_context: |
| problem_state.code_context["bug_count"] = 0 |
| problem_state.code_context["bug_count"] += 1 |
| |
| if "bug_categories" not in problem_state.code_context: |
| problem_state.code_context["bug_categories"] = [] |
| if bug_category not in problem_state.code_context["bug_categories"]: |
| problem_state.code_context["bug_categories"].append(bug_category) |
| |
    def _insert_syntax_bug(self, problem_state: ProblemState) -> None:
        """
        Insert a syntax bug into the problem state.

        Picks a random non-blank, non-comment line and applies one of four
        random corruptions (character removal, punctuation insertion,
        adjacent-character swap, or an indentation shift), then records
        the bug in ``code_context["bugs"]``.

        Args:
            problem_state: The problem state to modify
        """
        code = problem_state.code_context["code"]
        lines = code.split('\n')
        if not lines:
            return

        # Pick a random candidate line.
        idx = random.randint(0, len(lines) - 1)
        line = lines[idx]

        # Re-roll up to 10 times to avoid blank and comment-only lines.
        attempts = 0
        while (not line.strip() or line.strip().startswith('#')) and attempts < 10:
            idx = random.randint(0, len(lines) - 1)
            line = lines[idx]
            attempts += 1

        if attempts >= 10:
            # Random probing failed; fall back to the first code line.
            for i, line in enumerate(lines):
                if line.strip() and not line.strip().startswith('#'):
                    idx = i
                    break
            else:
                # No code lines at all — nothing to corrupt.
                return

        # Choose the kind of corruption to apply.
        mod_type = random.choice([
            "remove_character",
            "add_character",
            "swap_characters",
            "change_indent"
        ])

        if mod_type == "remove_character" and line:
            # Delete one random character from the line.
            char_idx = random.randint(0, len(line) - 1)
            lines[idx] = line[:char_idx] + line[char_idx+1:]

        elif mod_type == "add_character":
            # Insert a random punctuation character at a random position.
            char_idx = random.randint(0, len(line))
            char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."])
            lines[idx] = line[:char_idx] + char + line[char_idx:]

        elif mod_type == "swap_characters" and len(line) >= 2:
            # Transpose two adjacent characters.
            char_idx = random.randint(0, len(line) - 2)
            lines[idx] = (line[:char_idx] + line[char_idx+1] +
                          line[char_idx] + line[char_idx+2:])

        elif mod_type == "change_indent":
            # Shift the indentation by two spaces (either direction).
            if line.startswith("  "):
                lines[idx] = line[2:]
            else:
                lines[idx] = "  " + line

        # Write the corrupted code back.
        problem_state.code_context["code"] = '\n'.join(lines)

        # Record the introduced bug for later inspection/scoring.
        if "bugs" not in problem_state.code_context:
            problem_state.code_context["bugs"] = []

        problem_state.code_context["bugs"].append({
            "type": BugCategory.SYNTAX,
            "line": idx + 1,
            "description": f"Syntax error introduced in line {idx + 1}"
        })
| |
| def _insert_logical_bug(self, problem_state: ProblemState) -> None: |
| """ |
| Insert a logical bug into the problem state. |
| |
| Args: |
| problem_state: The problem state to modify |
| """ |
| code = problem_state.code_context["code"] |
| lines = code.split('\n') |
| if not lines: |
| return |
| |
| |
| if_statements = [] |
| for i, line in enumerate(lines): |
| if re.search(r'\bif\b|\bwhile\b|\bfor\b', line): |
| if_statements.append((i, line)) |
| |
| |
| mod_type = random.choice([ |
| "change_comparison", |
| "invert_condition", |
| "off_by_one", |
| "change_operator", |
| "reverse_logic" |
| ]) |
| |
| if if_statements: |
| |
| idx, line = random.choice(if_statements) |
| |
| if mod_type == "change_comparison": |
| |
| comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="} |
| for op, new_op in comparisons.items(): |
| if op in line: |
| lines[idx] = line.replace(op, new_op, 1) |
| break |
| |
| elif mod_type == "invert_condition": |
| |
| if "not" in line: |
| lines[idx] = line.replace("not ", "", 1) |
| else: |
| match = re.search(r'(if|while)\s+([^:]+):', line) |
| if match: |
| condition = match.group(2) |
| lines[idx] = line.replace(condition, f"not ({condition})", 1) |
| |
| elif mod_type == "off_by_one": |
| |
| for op in ["+", "-"]: |
| if op in line: |
| |
| match = re.search(f'\\{op}\\s*(\\d+)', line) |
| if match: |
| num = int(match.group(1)) |
| new_num = num + 1 if op == "+" else max(0, num - 1) |
| lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1) |
| break |
| |
| elif mod_type == "change_operator": |
| |
| operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"} |
| for op, new_op in operators.items(): |
| if f" {op} " in line: |
| lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1) |
| break |
| |
| elif mod_type == "reverse_logic": |
| |
| if " and " in line: |
| parts = line.split(" and ") |
| lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1) |
| elif " or " in line: |
| parts = line.split(" or ") |
| lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1) |
| |
| else: |
| |
| |
| assignments = [] |
| for i, line in enumerate(lines): |
| if "=" in line and "==" not in line and "!=" not in line: |
| assignments.append((i, line)) |
| |
| if assignments: |
| |
| idx, line = random.choice(assignments) |
| |
| |
| if "+" in line: |
| lines[idx] = line.replace("+", "-", 1) |
| elif "-" in line: |
| lines[idx] = line.replace("-", "+", 1) |
| elif "*" in line: |
| lines[idx] = line.replace("*", "/", 1) |
| elif "/" in line: |
| lines[idx] = line.replace("/", "*", 1) |
| else: |
| |
| match = re.search(r'=\s*(\d+)', line) |
| if match: |
| num = int(match.group(1)) |
| new_num = num + random.choice([-1, 1]) * random.randint(1, 3) |
| lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1) |
| |
| |
| problem_state.code_context["code"] = '\n'.join(lines) |
| |
| |
| if "bugs" not in problem_state.code_context: |
| problem_state.code_context["bugs"] = [] |
| |
| problem_state.code_context["bugs"].append({ |
| "type": BugCategory.LOGICAL, |
| "line": idx + 1, |
| "description": f"Logical error introduced in line {idx + 1}" |
| }) |
| |
| def _insert_performance_bug(self, problem_state: ProblemState) -> None: |
| """ |
| Insert a performance bug into the problem state. |
| |
| Args: |
| problem_state: The problem state to modify |
| """ |
| code = problem_state.code_context["code"] |
| lines = code.split('\n') |
| if not lines: |
| return |
| |
| |
| functions = [] |
| current_func = None |
| func_start = None |
| for i, line in enumerate(lines): |
| if line.strip().startswith("def "): |
| if current_func: |
| functions.append((func_start, i - 1, current_func)) |
| current_func = line.strip()[4:].split("(")[0] |
| func_start = i |
| elif i == len(lines) - 1 and current_func: |
| functions.append((func_start, i, current_func)) |
| |
| if not functions: |
| return |
| |
| |
| start_idx, end_idx, func_name = random.choice(functions) |
| |
| |
| mod_type = random.choice([ |
| "add_nested_loop", |
| "inefficient_data_structure", |
| "redundant_computation" |
| ]) |
| |
| if mod_type == "add_nested_loop": |
| |
| for i in range(start_idx + 1, end_idx + 1): |
| if lines[i].strip(): |
| indent = len(lines[i]) - len(lines[i].lstrip()) |
| break |
| else: |
| indent = 4 |
| |
| |
| for i in range(start_idx + 1, end_idx + 1): |
| if "for " in lines[i] or "while " in lines[i]: |
| |
| inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4 |
| inner_indent_str = ' ' * inner_indent |
| |
| |
| lines.insert(i + 1, f"{inner_indent_str}for _ in range(100): # Inefficient nested loop") |
| lines.insert(i + 2, f"{inner_indent_str} pass") |
| |
| |
| end_idx += 2 |
| break |
| else: |
| |
| inner_indent = indent + 4 |
| inner_indent_str = ' ' * inner_indent |
| |
| |
| for i in range(start_idx + 1, end_idx + 1): |
| if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): |
| |
| lines.insert(i, f"{' ' * indent}for i in range(100): # Inefficient loop") |
| lines.insert(i + 1, f"{inner_indent_str}pass") |
| |
| |
| end_idx += 2 |
| break |
| |
| elif mod_type == "ineff |
| # recursive_swe_bench/task_generators/bug_fixing.py (finalized) |
| |
| elif mod_type == "inefficient_data_structure": |
| # Find indentation of the function |
| for i in range(start_idx + 1, end_idx + 1): |
| if lines[i].strip(): |
| indent = len(lines[i]) - len(lines[i].lstrip()) |
| break |
| else: |
| indent = 4 |
| |
| # Find a suitable place to add inefficient data structure usage |
| for i in range(start_idx + 1, end_idx + 1): |
| if "def " not in lines[i] and lines[i].strip(): |
| # Add inefficient data structure usage after this line |
| indent_str = ' ' * indent |
| |
| # Add inefficient code |
| lines.insert(i + 1, f"{indent_str} |
| lines.insert(i + 2, f"{indent_str}results = []") |
| lines.insert(i + 3, f"{indent_str}for i in range(1000): # Unnecessarily large range") |
| lines.insert(i + 4, f"{indent_str} # Using list instead of set for lookups") |
| lines.insert(i + 5, f"{indent_str} if i % 10 in results: # O(n) lookup instead of O(1)") |
| lines.insert(i + 6, f"{indent_str} results.append(i) # Unnecessary storage") |
| |
| |
| end_idx += 6 |
| break |
| |
| elif mod_type == "redundant_computation": |
| |
| for i in range(start_idx + 1, end_idx + 1): |
| if lines[i].strip(): |
| indent = len(lines[i]) - len(lines[i].lstrip()) |
| break |
| else: |
| indent = 4 |
| |
| |
| for i in range(start_idx + 1, end_idx + 1): |
| if "for " in lines[i] or "while " in lines[i]: |
| |
| inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4 |
| inner_indent_str = ' ' * inner_indent |
| |
| |
| lines.insert(i + 1, f"{inner_indent_str}# Redundant computation in each iteration") |
| lines.insert(i + 2, f"{inner_indent_str}temp_sum = 0") |
| lines.insert(i + 3, f"{inner_indent_str}for j in range(100): # Unnecessary nested computation") |
| lines.insert(i + 4, f"{inner_indent_str} temp_sum += j") |
| |
| |
| end_idx += 4 |
| break |
| |
| |
| problem_state.code_context["code"] = '\n'.join(lines) |
| |
| |
| if "bugs" not in problem_state.code_context: |
| problem_state.code_context["bugs"] = [] |
| |
| problem_state.code_context["bugs"].append({ |
| "type": BugCategory.PERFORMANCE, |
| "line": start_idx + 1, |
| "description": f"Performance issue introduced in function '{func_name}'" |
| }) |
| |
    def _insert_edge_case_bug(self, problem_state: ProblemState) -> None:
        """
        Insert an edge case bug into the problem state.

        Picks a random function in ``code_context["code"]`` and applies one
        of three mutations: removing an existing boundary check, removing a
        zero-division / equality edge-case check, or adding code that makes
        an unchecked type assumption. The mutated code is written back and a
        bug record is appended to ``code_context["bugs"]``.

        Args:
            problem_state: The problem state to modify
        """
        code = problem_state.code_context["code"]
        lines = code.split('\n')
        if not lines:
            return

        # Locate function boundaries as (start line, end line, name).
        # NOTE(review): a function whose "def" header is the very last line
        # is never closed out by the elif below — confirm this edge case is
        # acceptable for the generated snippets.
        functions = []
        current_func = None
        func_start = None
        for i, line in enumerate(lines):
            if line.strip().startswith("def "):
                if current_func:
                    functions.append((func_start, i - 1, current_func))
                current_func = line.strip()[4:].split("(")[0]
                func_start = i
            elif i == len(lines) - 1 and current_func:
                functions.append((func_start, i, current_func))

        if not functions:
            return

        # Pick a random function to receive the bug.
        start_idx, end_idx, func_name = random.choice(functions)

        # Pick one of three edge-case mutation flavors.
        mod_type = random.choice([
            "remove_boundary_check",
            "missing_edge_case",
            "type_assumption"
        ])

        if mod_type == "remove_boundary_check":
            # Collect lines that look like defensive guards (empty / None /
            # zero-length checks) so one can be disabled.
            boundary_checks = []
            for i in range(start_idx + 1, end_idx + 1):
                if (re.search(r'if\s+.*(len|empty|<=|>=|<|>|==|!=)', lines[i]) and
                    (("if not " in lines[i]) or ("if len(" in lines[i]) or
                    ("if " in lines[i] and " == 0" in lines[i]) or
                    ("if " in lines[i] and " == []" in lines[i]) or
                    ("if " in lines[i] and " == ''" in lines[i]) or
                    ("if " in lines[i] and " is None" in lines[i]))):
                    boundary_checks.append(i)

            if boundary_checks:
                # Comment out one randomly chosen guard line.
                idx = random.choice(boundary_checks)

                lines[idx] = f"# {lines[idx]} # Boundary check removed"

                # Comment out the guard's (more deeply indented) body too.
                # NOTE(review): lines[idx] was just rewritten to start with
                # "#", so its computed indent is 0 and this loop comments out
                # every indented line up to end_idx, not just the guard's
                # body — confirm whether that is intended.
                i = idx + 1
                while i <= end_idx and (not lines[i].strip() or len(lines[i]) - len(lines[i].lstrip()) > len(lines[idx]) - len(lines[idx].lstrip())):
                    lines[i] = f"# {lines[i]}"
                    i += 1
            else:
                # No guard found: inject an unchecked first-element access
                # before the first real statement (skipping docstrings).
                # NOTE(review): the inserted code references a hard-coded
                # name "items" that may not exist in the target function.
                for i in range(start_idx + 1, end_idx + 1):
                    if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent

                        lines.insert(i, f"{indent_str}# Missing check for empty input")
                        lines.insert(i + 1, f"{indent_str}first_item = items[0] # Will fail on empty input")

                        end_idx += 2
                        break

        elif mod_type == "missing_edge_case":
            # Look for a division or an equality test whose edge-case
            # handling can be removed.
            for i in range(start_idx + 1, end_idx + 1):
                if ("/" in lines[i] or
                    "if " in lines[i] and "==" in lines[i] or
                    "if " in lines[i] and "!=" in lines[i]):

                    if "/" in lines[i] and not re.search(r'if\s+.*!=\s*0', lines[i-1]):
                        # Division found: remove any zero-division guard
                        # above it and mark the spot.
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent

                        # Identify the denominator variable, if any.
                        match = re.search(r'/\s*(\w+)', lines[i])
                        if match:
                            denominator = match.group(1)

                            # Walk upward through same-or-deeper indented
                            # lines, commenting out "== 0" checks on it.
                            j = i - 1
                            while j >= start_idx and len(lines[j]) - len(lines[j].lstrip()) >= indent:
                                if f"if {denominator}" in lines[j] and "== 0" in lines[j]:
                                    lines[j] = f"# {lines[j]} # Zero division check removed"
                                j -= 1

                            lines.insert(i, f"{indent_str}# Missing check for zero division")

                            end_idx += 1
                            break

                    elif ("==" in lines[i] or "!=" in lines[i]) and "None" not in lines[i]:
                        # Equality-based edge-case check: comment it out.
                        lines[i] = f"# {lines[i]} # Edge case check removed"
                        break
            else:
                # Nothing suitable found: leave marker comments before the
                # first real statement instead.
                for i in range(start_idx + 1, end_idx + 1):
                    if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent

                        lines.insert(i, f"{indent_str}# Missing handling for edge cases")
                        lines.insert(i + 1, f"{indent_str}# This function doesn't handle special cases properly")

                        end_idx += 2
                        break

        elif mod_type == "type_assumption":
            # Find a loop over, or assignment involving, a variable and bolt
            # on code that assumes it is a list.
            for i in range(start_idx + 1, end_idx + 1):
                if re.search(r'for\s+\w+\s+in\s+\w+', lines[i]) or "=" in lines[i] and "[" in lines[i]:
                    # Prefer the iterable of a for-loop; otherwise the
                    # assignment target.
                    var_match = re.search(r'for\s+\w+\s+in\s+(\w+)', lines[i])
                    if not var_match:
                        var_match = re.search(r'(\w+)\s*=', lines[i])

                    if var_match:
                        var_name = var_match.group(1)
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent

                        lines.insert(i + 1, f"{indent_str}# Type assumption: {var_name} is assumed to be a list")
                        lines.insert(i + 2, f"{indent_str}if len({var_name}) > 0: # Will fail if {var_name} doesn't support len()")
                        lines.insert(i + 3, f"{indent_str} first = {var_name}[0] # Will fail if {var_name} is not subscriptable")

                        end_idx += 3
                        break
            else:
                # No candidate line: assume something about the function's
                # first parameter instead.
                for i in range(start_idx + 1, end_idx + 1):
                    if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent

                        # Extract the first parameter name from the header.
                        param_match = re.search(r'def\s+\w+\s*\(\s*(\w+)', lines[start_idx])
                        param_name = param_match.group(1) if param_match else "input_data"

                        lines.insert(i, f"{indent_str}# Type assumption: {param_name} is assumed to be a specific type")
                        lines.insert(i + 1, f"{indent_str}{param_name}_str = str({param_name}) # Will fail if {param_name} can't be converted to string")

                        end_idx += 2
                        break

        # Write the mutated code back and record the injected bug.
        problem_state.code_context["code"] = '\n'.join(lines)

        if "bugs" not in problem_state.code_context:
            problem_state.code_context["bugs"] = []

        problem_state.code_context["bugs"].append({
            "type": BugCategory.EDGE_CASE,
            "line": start_idx + 1,
            "description": f"Edge case bug introduced in function '{func_name}'"
        })
| |
| def _generate_description(self, problem_state: ProblemState) -> str: |
| """ |
| Generate a description for the current problem state. |
| |
| Args: |
| problem_state: The problem state |
| |
| Returns: |
| A descriptive prompt for the problem |
| """ |
| |
| bug_count = problem_state.code_context.get("bug_count", 0) |
| plural = "bugs" if bug_count != 1 else "bug" |
| |
| base_desc = ( |
| f"Fix the {plural} in the code below. " |
| f"There {'are' if bug_count != 1 else 'is'} {bug_count} {plural} to find and fix." |
| ) |
| |
| |
| if "bug_categories" in problem_state.code_context: |
| categories = problem_state.code_context["bug_categories"] |
| if categories: |
| category_desc = ", ".join(categories) |
| base_desc += f"\n\nThe code contains the following types of issues: {category_desc}." |
| |
| |
| if problem_state.requirements: |
| base_desc += "\n\nRequirements:" |
| for i, req in enumerate(problem_state.requirements): |
| base_desc += f"\n{i+1}. {req['description']}" |
| |
| |
| difficulty_desc = "easy" |
| if problem_state.difficulty > 0.3 and problem_state.difficulty <= 0.6: |
| difficulty_desc = "moderate" |
| elif problem_state.difficulty > 0.6 and problem_state.difficulty <= 0.8: |
| difficulty_desc = "challenging" |
| elif problem_state.difficulty > 0.8: |
| difficulty_desc = "very challenging" |
| |
| base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task." |
| |
| return base_desc |
|
|
|
|
| |
class DefaultTestRunner:
    """
    Default test runner for evaluating solutions.

    Imports the solution module (to surface syntax/import errors and to time
    the load), then discovers and runs each unittest-style test file,
    capturing output and per-test failure details.
    """

    def run_tests(
        self,
        solution_file: Path,
        test_files: List[Path],
        code_context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Run tests against a solution file.

        Args:
            solution_file: Path to the solution file
            test_files: List of test file paths
            code_context: Additional context about the code (accepted for
                interface compatibility; not used by this default runner)

        Returns:
            Dictionary with keys:
            - all_passed: True iff the module loaded and every test passed
            - passed_tests / total_tests: aggregate counts
            - tests: per-test-file summaries plus per-failed-test entries
            - execution: module-load success/error and captured output
            - execution_time: seconds spent importing the solution module
        """
        # Imported locally: these modules are not imported at the top of
        # this file, so a function-scope import keeps the runner
        # self-contained (previously they raised NameError at runtime).
        import importlib.util
        import io
        import sys
        import time
        import unittest
        from contextlib import redirect_stdout, redirect_stderr

        results = {
            "all_passed": True,
            "passed_tests": 0,
            "total_tests": 0,
            "tests": {},
            "execution": {
                "success": True,
                "error": None,
                "stdout": "",
                "stderr": ""
            },
            "execution_time": 0.0
        }

        # Fail fast if the solution file is missing.
        if not solution_file.exists():
            results["execution"]["success"] = False
            results["execution"]["error"] = f"Solution file not found: {solution_file}"
            results["all_passed"] = False
            return results

        # Import the solution module once to catch syntax/import errors
        # before attempting to run any tests.
        sys.path.insert(0, str(solution_file.parent))
        try:
            start_time = time.time()
            spec = importlib.util.spec_from_file_location(
                "solution", solution_file)
            solution_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(solution_module)
            results["execution_time"] = time.time() - start_time
        except Exception as e:
            results["execution"]["success"] = False
            results["execution"]["error"] = str(e)
            results["all_passed"] = False
            return results
        finally:
            # Always restore sys.path (the original leaked the entry when
            # the import raised).
            sys.path.pop(0)

        for test_file in test_files:
            # Silently skip missing test files, matching prior behavior.
            if not test_file.exists():
                continue

            try:
                loader = unittest.TestLoader()

                sys.path.insert(0, str(test_file.parent))
                try:
                    stdout_buffer = io.StringIO()
                    stderr_buffer = io.StringIO()

                    # Discover only this specific test file.
                    test_suite = loader.discover(
                        str(test_file.parent),
                        pattern=test_file.name
                    )

                    # Count individual test cases before running.
                    test_count = 0
                    for suite in test_suite:
                        for test_case in suite:
                            test_count += test_case.countTestCases()

                    results["total_tests"] += test_count

                    # Run the suite with stdout/stderr captured.
                    with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
                        test_runner = unittest.TextTestRunner(verbosity=2)
                        test_result = test_runner.run(test_suite)

                    stdout = stdout_buffer.getvalue()
                    stderr = stderr_buffer.getvalue()
                finally:
                    # Restore sys.path even if discovery or the run raises
                    # (previously leaked on any exception in this block).
                    sys.path.pop(0)

                if not test_result.wasSuccessful():
                    results["all_passed"] = False

                passed_tests = test_count - len(test_result.failures) - len(test_result.errors)
                results["passed_tests"] += passed_tests

                # Per-file summary keyed by the test file's stem.
                test_name = test_file.stem
                results["tests"][test_name] = {
                    "passed": test_result.wasSuccessful(),
                    "failures": len(test_result.failures),
                    "errors": len(test_result.errors),
                    "skipped": len(test_result.skipped),
                    "total": test_count,
                    "passed_count": passed_tests,
                    "stdout": stdout,
                    "stderr": stderr
                }

                # Record details for each failed/errored test, extracting
                # "Expected:"/"Actual:" values from the message when present.
                for failure in test_result.failures + test_result.errors:
                    test_id = failure[0].id().split('.')[-1]
                    failure_message = failure[1]

                    expected_match = re.search(r'Expected\s*:(.+)', failure_message)
                    actual_match = re.search(r'Actual\s*:(.+)', failure_message)

                    expected = expected_match.group(1).strip() if expected_match else None
                    actual = actual_match.group(1).strip() if actual_match else None

                    if test_id not in results["tests"]:
                        results["tests"][test_id] = {}

                    results["tests"][test_id].update({
                        "passed": False,
                        "message": failure_message,
                        "expected": expected,
                        "actual": actual
                    })

            except Exception as e:
                # Treat a crash while loading/running a test file as one
                # failing test rather than aborting the whole run.
                results["all_passed"] = False
                test_name = test_file.stem
                results["tests"][test_name] = {
                    "passed": False,
                    "error": str(e),
                    "failures": 0,
                    "errors": 1,
                    "skipped": 0,
                    "total": 1,
                    "passed_count": 0
                }
                results["total_tests"] += 1

        return results
|
|