Spaces:
Paused
Paused
| from typing import List, Dict, Any | |
| from dataclasses import dataclass | |
| class OverviewTask: | |
| task_id: str | |
| name: str | |
| description: str | |
| task_type: str | |
| difficulty: str | |
| input_text: str | |
| question: str | None | |
| expected_output: str | None | |
| evaluation_criteria: Dict[str, Any] | |
| max_steps: int | |
@dataclass
class GradingResult:
    """Outcome of grading a single agent response.

    Bug fix: the `@dataclass` decorator was missing — the evaluator
    constructs this both positionally (`GradingResult(0.0, 0.0, 0.0, 0.0,
    "Unknown task type")`) and with keywords, which requires the generated
    `__init__`. All component scores are expected to lie in [0.0, 1.0].
    """

    score: float         # weighted combination of the components below
    accuracy: float      # content correctness
    quality: float       # presentation/explanation quality
    completeness: float  # coverage of required points/fields
    feedback: str        # short human-readable grading note
class OverviewTaskEvaluator:
    """Grades an agent's response dict against one OverviewTask's rubric."""

    def __init__(self, task: OverviewTask):
        self.task = task

    def grade(self, response: Dict[str, Any]) -> GradingResult:
        """Dispatch to the grader matching the task's type.

        Unknown task types yield an all-zero result rather than raising.
        """
        graders = {
            "summarization": self._grade_summarization,
            "question_answering": self._grade_qa,
            "code_analysis": self._grade_code_analysis,
            "information_extraction": self._grade_info_extraction,
        }
        grader = graders.get(self.task.task_type)
        if grader is None:
            return GradingResult(0.0, 0.0, 0.0, 0.0, "Unknown task type")
        return grader(response)

    def _grade_summarization(self, response: Dict[str, Any]) -> GradingResult:
        """Score a summary on key-point coverage, length, and non-emptiness."""
        text = response.get("summary", "").lower()
        rubric = self.task.evaluation_criteria
        points = rubric.get("key_points", [])
        hits = len([p for p in points if p.lower() in text])
        # Neutral 0.5 when the rubric names no key points to look for.
        completeness = hits / len(points) if points else 0.5
        n_words = len(text.split())
        target = rubric.get("target_length", 50)
        # Length quality saturates at 1.0 once the target word count is met.
        quality = 0.8 if not target > 0 else min(1.0, n_words / target)
        accuracy = 0.0 if not text.strip() else 0.9
        score = accuracy * 0.3 + quality * 0.3 + completeness * 0.4
        return GradingResult(
            score=score,
            accuracy=accuracy,
            quality=quality,
            completeness=completeness,
            feedback=f"Found {hits}/{len(points)} key points",
        )

    def _grade_qa(self, response: Dict[str, Any]) -> GradingResult:
        """Score an answer by word overlap with the expected output."""
        answer = response.get("answer", "").lower()
        reference = self.task.expected_output
        expected = reference.lower() if reference else ""
        if expected:
            got = set(answer.split())
            want = set(expected.split())
            accuracy = len(got & want) / max(len(want), 1)
        elif answer.strip():
            # No reference answer: give partial credit up to 10 words.
            accuracy = min(1.0, len(answer.split()) / 10)
        else:
            accuracy = 0.0
        # Reward explanatory connectives as a rough quality proxy.
        has_reasoning = any(m in answer for m in ("because", "since", "therefore"))
        quality = 0.9 if has_reasoning else 0.6
        completeness = 0.0 if not answer.strip() else 1.0
        score = accuracy * 0.5 + quality * 0.3 + completeness * 0.2
        return GradingResult(
            score=score,
            accuracy=accuracy,
            quality=quality,
            completeness=completeness,
            feedback=f"Answer length: {len(answer.split())} words",
        )

    def _grade_code_analysis(self, response: Dict[str, Any]) -> GradingResult:
        """Score a code explanation on required-concept coverage."""
        text = response.get("explanation", "").lower()
        rubric = self.task.evaluation_criteria
        wanted = rubric.get("concepts", [])
        mentioned = len([c for c in wanted if c.lower() in text])
        completeness = mentioned / len(wanted) if wanted else 0.5
        # Concrete examples bump the quality component.
        gives_example = "example" in text or "e.g." in text
        quality = 0.9 if gives_example else 0.7
        accuracy = 0.0 if not text.strip() else 0.85
        score = accuracy * 0.4 + quality * 0.3 + completeness * 0.3
        return GradingResult(
            score=score,
            accuracy=accuracy,
            quality=quality,
            completeness=completeness,
            feedback=f"Identified {mentioned}/{len(wanted)} concepts",
        )

    def _grade_info_extraction(self, response: Dict[str, Any]) -> GradingResult:
        """Score extracted fields against the rubric's required field list."""
        payload = response.get("extracted_info", {})
        rubric = self.task.evaluation_criteria
        wanted = rubric.get("required_fields", [])
        # A field counts only when present AND truthy (non-empty value).
        present = len([f for f in wanted if f in payload and payload[f]])
        completeness = present / len(wanted) if wanted else 0.5
        quality = 0.8 if isinstance(payload, dict) else 0.5
        accuracy = 0.85 if payload else 0.0
        score = accuracy * 0.3 + quality * 0.3 + completeness * 0.4
        return GradingResult(
            score=score,
            accuracy=accuracy,
            quality=quality,
            completeness=completeness,
            feedback=f"Extracted {present}/{len(wanted)} fields",
        )
def create_summarization_task() -> OverviewTask:
    """Build the easy AI-article summarization task."""
    article = (
        "Artificial Intelligence (AI) is rapidly transforming various industries, "
        "from healthcare to finance. In healthcare, AI algorithms can analyze "
        "medical images with remarkable accuracy, helping doctors detect diseases "
        "earlier. In finance, AI-powered systems detect fraudulent transactions in "
        "real-time, saving millions of dollars annually. However, AI also raises "
        "ethical concerns about privacy, bias, and job displacement."
    )
    return OverviewTask(
        task_id="summarize_article",
        name="Article Summarization",
        description="Provide a concise summary of the given article about AI",
        task_type="summarization",
        difficulty="easy",
        input_text=article,
        question=None,
        expected_output=None,
        evaluation_criteria={
            "key_points": [
                "transforming industries",
                "healthcare",
                "finance",
                "ethical concerns",
            ],
            "target_length": 50,
        },
        max_steps=5,
    )
def create_qa_task() -> OverviewTask:
    """Build the easy Python-history question-answering task."""
    passage = (
        "Python is a high-level programming language created by Guido van Rossum "
        "in 1991. It emphasizes code readability with its notable use of "
        "significant indentation. Python supports multiple programming paradigms, "
        "including structured, procedural, reflective, and object-oriented "
        "programming."
    )
    return OverviewTask(
        task_id="python_qa",
        name="Question Answering",
        description="Answer the question based on the provided text",
        task_type="question_answering",
        difficulty="easy",
        input_text=passage,
        question="Who created Python and when?",
        expected_output="Guido van Rossum in 1991",
        evaluation_criteria={},
        max_steps=3,
    )
def create_code_analysis_task() -> OverviewTask:
    """Build the medium-difficulty recursive-Fibonacci analysis task."""
    snippet = '''def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)'''
    return OverviewTask(
        task_id="code_analysis_fib",
        name="Code Analysis",
        description="Analyze the code and explain what it does",
        task_type="code_analysis",
        difficulty="medium",
        input_text=snippet,
        question=None,
        expected_output=None,
        evaluation_criteria={
            "concepts": ["recursion", "fibonacci", "exponential", "memoization"],
        },
        max_steps=5,
    )
def create_info_extraction_task() -> OverviewTask:
    """Build the easy person-details information-extraction task."""
    bio = (
        "John Doe is a software engineer at TechCorp. He was born on March 15, "
        "1990, in San Francisco, California. His email is john.doe@techcorp.com "
        "and his phone number is (555) 123-4567."
    )
    return OverviewTask(
        task_id="extract_person_info",
        name="Information Extraction",
        description="Extract structured information about the person from the text",
        task_type="information_extraction",
        difficulty="easy",
        input_text=bio,
        question=None,
        expected_output=None,
        evaluation_criteria={
            "required_fields": ["name", "email", "phone", "position"],
        },
        max_steps=3,
    )
def get_all_tasks() -> List[OverviewTask]:
    """Return fresh instances of every predefined overview task."""
    factories = (
        create_summarization_task,
        create_qa_task,
        create_code_analysis_task,
        create_info_extraction_task,
    )
    return [factory() for factory in factories]
def get_task_by_id(task_id: str) -> OverviewTask:
    """Look up a predefined task by its unique id.

    Raises:
        ValueError: if no predefined task carries the given id.
    """
    match = next(
        (t for t in get_all_tasks() if t.task_id == task_id),
        None,
    )
    if match is None:
        raise ValueError(f"Task not found: {task_id}")
    return match