# overview-env/tasks/definitions.py
# Uploaded with huggingface_hub by hirann (commit abed3e5, verified).
from typing import List, Dict, Any
from dataclasses import dataclass
@dataclass
class OverviewTask:
    """Declarative specification of one overview-environment task.

    Instances are consumed by ``OverviewTaskEvaluator`` to score agent
    responses; the ``create_*_task`` factories below build the concrete
    built-in tasks.
    """

    task_id: str          # unique identifier, used by get_task_by_id()
    name: str             # short human-readable title
    description: str      # instructions presented to the agent
    task_type: str        # dispatch key: "summarization", "question_answering", "code_analysis", or "information_extraction"
    difficulty: str       # informal label ("easy", "medium", ...)
    input_text: str       # the article / code / passage the task operates on
    question: str | None  # question to answer (QA tasks only; None otherwise)
    expected_output: str | None  # reference answer when one exists
    evaluation_criteria: dict[str, Any]  # grader-specific knobs (key_points, target_length, concepts, required_fields)
    max_steps: int        # step budget granted to the agent
@dataclass
class GradingResult:
    """Score breakdown produced by ``OverviewTaskEvaluator.grade``.

    Each component lies in [0.0, 1.0]; ``score`` is a weighted sum of the
    components whose weights vary per task type.
    """

    score: float         # overall weighted score
    accuracy: float      # correctness component
    quality: float       # writing / explanation quality component
    completeness: float  # coverage of required points / fields
    feedback: str        # short human-readable grading note
class OverviewTaskEvaluator:
    """Grades an agent's response to a single OverviewTask.

    One evaluator wraps one task; :meth:`grade` dispatches on
    ``task.task_type`` to a per-type scorer.  Every component score
    (accuracy, quality, completeness) lies in [0.0, 1.0] and the overall
    score is a per-type weighted sum of the components.
    """

    def __init__(self, task: OverviewTask):
        self.task = task

    @staticmethod
    def _lower_text(response: Dict[str, Any], key: str) -> str:
        """Fetch ``response[key]`` as lower-cased text.

        Agent responses are untrusted input: the key may be absent or may
        hold a non-string (e.g. None).  Coerce those cases to "" instead
        of crashing with AttributeError on ``.lower()``.
        """
        value = response.get(key, "")
        return value.lower() if isinstance(value, str) else ""

    def grade(self, response: Dict[str, Any]) -> GradingResult:
        """Score *response* according to this task's type.

        Returns an all-zero GradingResult (feedback "Unknown task type")
        for unrecognized task types rather than raising.
        """
        scorers = {
            "summarization": self._grade_summarization,
            "question_answering": self._grade_qa,
            "code_analysis": self._grade_code_analysis,
            "information_extraction": self._grade_info_extraction,
        }
        scorer = scorers.get(self.task.task_type)
        if scorer is None:
            return GradingResult(0.0, 0.0, 0.0, 0.0, "Unknown task type")
        return scorer(response)

    def _grade_summarization(self, response: Dict[str, Any]) -> GradingResult:
        """Score a summary by key-point coverage and length vs. target."""
        summary = self._lower_text(response, "summary")
        criteria = self.task.evaluation_criteria
        key_points = criteria.get("key_points", [])
        found_points = sum(1 for pt in key_points if pt.lower() in summary)
        # Neutral 0.5 when the task declares no key points to check.
        completeness = found_points / len(key_points) if key_points else 0.5
        word_count = len(summary.split())
        target_length = criteria.get("target_length", 50)
        # Reward length up to the target, capped at 1.0 (no over-length penalty).
        quality = min(1.0, word_count / target_length) if target_length > 0 else 0.8
        accuracy = 0.9 if summary.strip() else 0.0
        score = accuracy * 0.3 + quality * 0.3 + completeness * 0.4
        return GradingResult(
            score=score,
            accuracy=accuracy,
            quality=quality,
            completeness=completeness,
            feedback=f"Found {found_points}/{len(key_points)} key points",
        )

    def _grade_qa(self, response: Dict[str, Any]) -> GradingResult:
        """Score an answer by bag-of-words overlap with the expected output."""
        answer = self._lower_text(response, "answer")
        expected = self.task.expected_output.lower() if self.task.expected_output else ""
        if expected:
            # Fraction of expected words that appear in the answer.
            answer_words = set(answer.split())
            expected_words = set(expected.split())
            accuracy = len(answer_words & expected_words) / max(len(expected_words), 1)
        else:
            # No reference answer: credit up to 10 words of content.
            accuracy = min(1.0, len(answer.split()) / 10) if answer.strip() else 0.0
        # Explanatory connectives serve as a cheap proxy for reasoning quality.
        quality = 0.9 if any(w in answer for w in ["because", "since", "therefore"]) else 0.6
        completeness = 1.0 if answer.strip() else 0.0
        score = accuracy * 0.5 + quality * 0.3 + completeness * 0.2
        return GradingResult(
            score=score,
            accuracy=accuracy,
            quality=quality,
            completeness=completeness,
            feedback=f"Answer length: {len(answer.split())} words",
        )

    def _grade_code_analysis(self, response: Dict[str, Any]) -> GradingResult:
        """Score a code explanation by required-concept coverage."""
        explanation = self._lower_text(response, "explanation")
        criteria = self.task.evaluation_criteria
        required_concepts = criteria.get("concepts", [])
        found_concepts = sum(1 for c in required_concepts if c.lower() in explanation)
        # Neutral 0.5 when the task declares no concepts to check.
        completeness = found_concepts / len(required_concepts) if required_concepts else 0.5
        # Concrete examples earn a quality bonus.
        quality = 0.9 if "example" in explanation or "e.g." in explanation else 0.7
        accuracy = 0.85 if explanation.strip() else 0.0
        score = accuracy * 0.4 + quality * 0.3 + completeness * 0.3
        return GradingResult(
            score=score,
            accuracy=accuracy,
            quality=quality,
            completeness=completeness,
            feedback=f"Identified {found_concepts}/{len(required_concepts)} concepts",
        )

    def _grade_info_extraction(self, response: Dict[str, Any]) -> GradingResult:
        """Score extracted fields against the required-field list."""
        extracted = response.get("extracted_info", {})
        criteria = self.task.evaluation_criteria
        fields = criteria.get("required_fields", [])
        is_mapping = isinstance(extracted, dict)
        # Only index when we genuinely have a mapping: for e.g. a string,
        # `f in extracted` can succeed while `extracted[f]` raises TypeError.
        if is_mapping:
            found_fields = sum(1 for f in fields if f in extracted and extracted[f])
        else:
            found_fields = 0
        completeness = found_fields / len(fields) if fields else 0.5
        quality = 0.8 if is_mapping else 0.5
        accuracy = 0.85 if extracted else 0.0
        score = accuracy * 0.3 + quality * 0.3 + completeness * 0.4
        return GradingResult(
            score=score,
            accuracy=accuracy,
            quality=quality,
            completeness=completeness,
            feedback=f"Extracted {found_fields}/{len(fields)} fields",
        )
def create_summarization_task() -> OverviewTask:
    """Build the easy article-summarization task about AI."""
    article = (
        "Artificial Intelligence (AI) is rapidly transforming various "
        "industries, from healthcare to finance. In healthcare, AI "
        "algorithms can analyze medical images with remarkable accuracy, "
        "helping doctors detect diseases earlier. In finance, AI-powered "
        "systems detect fraudulent transactions in real-time, saving "
        "millions of dollars annually. However, AI also raises ethical "
        "concerns about privacy, bias, and job displacement."
    )
    criteria = {
        "key_points": ["transforming industries", "healthcare", "finance", "ethical concerns"],
        "target_length": 50,
    }
    return OverviewTask(
        task_id="summarize_article",
        name="Article Summarization",
        description="Provide a concise summary of the given article about AI",
        task_type="summarization",
        difficulty="easy",
        input_text=article,
        question=None,
        expected_output=None,
        evaluation_criteria=criteria,
        max_steps=5,
    )
def create_qa_task() -> OverviewTask:
    """Build the easy question-answering task about Python's origin."""
    passage = (
        "Python is a high-level programming language created by Guido "
        "van Rossum in 1991. It emphasizes code readability with its "
        "notable use of significant indentation. Python supports multiple "
        "programming paradigms, including structured, procedural, "
        "reflective, and object-oriented programming."
    )
    return OverviewTask(
        task_id="python_qa",
        name="Question Answering",
        description="Answer the question based on the provided text",
        task_type="question_answering",
        difficulty="easy",
        input_text=passage,
        question="Who created Python and when?",
        expected_output="Guido van Rossum in 1991",
        evaluation_criteria={},
        max_steps=3,
    )
def create_code_analysis_task() -> OverviewTask:
    """Build the medium code-analysis task on naive recursive Fibonacci."""
    # NOTE(review): the snippet's internal indentation was mangled in the
    # upstream copy; reconstructed here as standard 4-space Python.
    snippet = (
        "def fibonacci(n):\n"
        "    if n <= 1:\n"
        "        return n\n"
        "    return fibonacci(n-1) + fibonacci(n-2)"
    )
    return OverviewTask(
        task_id="code_analysis_fib",
        name="Code Analysis",
        description="Analyze the code and explain what it does",
        task_type="code_analysis",
        difficulty="medium",
        input_text=snippet,
        question=None,
        expected_output=None,
        evaluation_criteria={"concepts": ["recursion", "fibonacci", "exponential", "memoization"]},
        max_steps=5,
    )
def create_info_extraction_task() -> OverviewTask:
    """Build the easy structured-information-extraction task."""
    bio = (
        "John Doe is a software engineer at TechCorp. He was born on "
        "March 15, 1990, in San Francisco, California. His email is "
        "john.doe@techcorp.com and his phone number is (555) 123-4567."
    )
    return OverviewTask(
        task_id="extract_person_info",
        name="Information Extraction",
        description="Extract structured information about the person from the text",
        task_type="information_extraction",
        difficulty="easy",
        input_text=bio,
        question=None,
        expected_output=None,
        evaluation_criteria={"required_fields": ["name", "email", "phone", "position"]},
        max_steps=3,
    )
def get_all_tasks() -> List[OverviewTask]:
    """Return a fresh list of every built-in task, one per task type."""
    factories = (
        create_summarization_task,
        create_qa_task,
        create_code_analysis_task,
        create_info_extraction_task,
    )
    return [factory() for factory in factories]
def get_task_by_id(task_id: str) -> OverviewTask:
    """Look up a built-in task by its id.

    Raises:
        ValueError: if no task with *task_id* exists.
    """
    found = next((t for t in get_all_tasks() if t.task_id == task_id), None)
    if found is None:
        raise ValueError(f"Task not found: {task_id}")
    return found