File size: 7,450 Bytes
abed3e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from typing import List, Dict, Any
from dataclasses import dataclass


@dataclass
class OverviewTask:
    task_id: str
    name: str
    description: str
    task_type: str
    difficulty: str
    input_text: str
    question: str | None
    expected_output: str | None
    evaluation_criteria: Dict[str, Any]
    max_steps: int


@dataclass
class GradingResult:
    """Outcome of grading one response.

    ``score`` is a weighted blend of the three component metrics; the
    weights vary per task type (see OverviewTaskEvaluator). All metric
    values are expected to lie in [0.0, 1.0].
    """

    score: float         # overall weighted score
    accuracy: float      # factual/overlap correctness component
    quality: float       # style/structure component
    completeness: float  # coverage of required points/fields component
    feedback: str        # short human-readable grading note


class OverviewTaskEvaluator:
    """Grades a structured response dict against an OverviewTask.

    Each supported task type has a dedicated ``_grade_*`` method; every
    grader returns a GradingResult whose ``score`` is a weighted blend
    of accuracy, quality, and completeness (weights differ per type).

    Robustness note: ``response`` values are guarded with ``or ""`` /
    ``or {}`` so an explicit ``None`` under an expected key degrades to
    "empty" instead of raising AttributeError/TypeError.
    """

    def __init__(self, task: OverviewTask):
        self.task = task

    def grade(self, response: dict[str, Any]) -> GradingResult:
        """Route *response* to the grader matching the task's type.

        Returns an all-zero GradingResult for unrecognized task types.
        """
        # Dispatch table instead of an if/elif chain: adding a new task
        # type is a one-line change.
        graders = {
            "summarization": self._grade_summarization,
            "question_answering": self._grade_qa,
            "code_analysis": self._grade_code_analysis,
            "information_extraction": self._grade_info_extraction,
        }
        grader = graders.get(self.task.task_type)
        if grader is None:
            return GradingResult(0.0, 0.0, 0.0, 0.0, "Unknown task type")
        return grader(response)

    def _grade_summarization(self, response: dict[str, Any]) -> GradingResult:
        """Score a summary by key-point coverage and length vs. target."""
        summary = (response.get("summary") or "").lower()
        criteria = self.task.evaluation_criteria
        key_points = criteria.get("key_points", [])
        found_points = sum(1 for pt in key_points if pt.lower() in summary)
        # Neutral 0.5 when the task defines no key points to check.
        completeness = found_points / len(key_points) if key_points else 0.5
        word_count = len(summary.split())
        target_length = criteria.get("target_length", 50)
        # Quality rises linearly toward the target length, capped at 1.0
        # (summaries longer than target are not penalized).
        quality = min(1.0, word_count / target_length) if target_length > 0 else 0.8
        # Accuracy is a fixed credit for any non-blank summary; there is
        # no reference text to check factual correctness against.
        accuracy = 0.9 if summary.strip() else 0.0
        score = accuracy * 0.3 + quality * 0.3 + completeness * 0.4
        return GradingResult(score=score, accuracy=accuracy, quality=quality, completeness=completeness, feedback=f"Found {found_points}/{len(key_points)} key points")

    def _grade_qa(self, response: dict[str, Any]) -> GradingResult:
        """Score an answer by word overlap with the expected output."""
        answer = (response.get("answer") or "").lower()
        expected = self.task.expected_output.lower() if self.task.expected_output else ""
        if expected:
            # Bag-of-words recall against the reference answer.
            answer_words = set(answer.split())
            expected_words = set(expected.split())
            accuracy = len(answer_words & expected_words) / max(len(expected_words), 1)
        else:
            # No reference: credit scales with answer length up to 10 words.
            accuracy = min(1.0, len(answer.split()) / 10) if answer.strip() else 0.0
        # Reward answers that show reasoning connectives.
        quality = 0.9 if any(w in answer for w in ["because", "since", "therefore"]) else 0.6
        completeness = 1.0 if answer.strip() else 0.0
        score = accuracy * 0.5 + quality * 0.3 + completeness * 0.2
        return GradingResult(score=score, accuracy=accuracy, quality=quality, completeness=completeness, feedback=f"Answer length: {len(answer.split())} words")

    def _grade_code_analysis(self, response: dict[str, Any]) -> GradingResult:
        """Score a code explanation by required-concept coverage."""
        explanation = (response.get("explanation") or "").lower()
        criteria = self.task.evaluation_criteria
        required_concepts = criteria.get("concepts", [])
        found_concepts = sum(1 for c in required_concepts if c.lower() in explanation)
        # Neutral 0.5 when the task defines no concepts to check.
        completeness = found_concepts / len(required_concepts) if required_concepts else 0.5
        # Bonus for giving a concrete example in the explanation.
        quality = 0.9 if "example" in explanation or "e.g." in explanation else 0.7
        accuracy = 0.85 if explanation.strip() else 0.0
        score = accuracy * 0.4 + quality * 0.3 + completeness * 0.3
        return GradingResult(score=score, accuracy=accuracy, quality=quality, completeness=completeness, feedback=f"Identified {found_concepts}/{len(required_concepts)} concepts")

    def _grade_info_extraction(self, response: dict[str, Any]) -> GradingResult:
        """Score extracted data by presence of required, non-empty fields."""
        extracted = response.get("extracted_info") or {}
        criteria = self.task.evaluation_criteria
        fields = criteria.get("required_fields", [])
        # A field counts only when present AND truthy (non-empty value).
        found_fields = sum(1 for f in fields if f in extracted and extracted[f])
        completeness = found_fields / len(fields) if fields else 0.5
        # Partial quality credit when the payload is not a dict at all.
        quality = 0.8 if isinstance(extracted, dict) else 0.5
        accuracy = 0.85 if extracted else 0.0
        score = accuracy * 0.3 + quality * 0.3 + completeness * 0.4
        return GradingResult(score=score, accuracy=accuracy, quality=quality, completeness=completeness, feedback=f"Extracted {found_fields}/{len(fields)} fields")


def create_summarization_task() -> OverviewTask:
    """Build the built-in article-summarization task."""
    article = "Artificial Intelligence (AI) is rapidly transforming various industries, from healthcare to finance. In healthcare, AI algorithms can analyze medical images with remarkable accuracy, helping doctors detect diseases earlier. In finance, AI-powered systems detect fraudulent transactions in real-time, saving millions of dollars annually. However, AI also raises ethical concerns about privacy, bias, and job displacement."
    criteria = {
        "key_points": ["transforming industries", "healthcare", "finance", "ethical concerns"],
        "target_length": 50,
    }
    return OverviewTask(
        task_id="summarize_article",
        name="Article Summarization",
        description="Provide a concise summary of the given article about AI",
        task_type="summarization",
        difficulty="easy",
        input_text=article,
        question=None,
        expected_output=None,
        evaluation_criteria=criteria,
        max_steps=5,
    )


def create_qa_task() -> OverviewTask:
    """Build the built-in question-answering task about Python's origins."""
    passage = "Python is a high-level programming language created by Guido van Rossum in 1991. It emphasizes code readability with its notable use of significant indentation. Python supports multiple programming paradigms, including structured, procedural, reflective, and object-oriented programming."
    return OverviewTask(
        task_id="python_qa",
        name="Question Answering",
        description="Answer the question based on the provided text",
        task_type="question_answering",
        difficulty="easy",
        input_text=passage,
        question="Who created Python and when?",
        expected_output="Guido van Rossum in 1991",
        evaluation_criteria={},
        max_steps=3,
    )


def create_code_analysis_task() -> OverviewTask:
    """Build the built-in code-analysis task (naive recursive Fibonacci)."""
    # NOTE: the snippet's internal indentation is part of the task input
    # and must stay exactly as written.
    snippet = '''def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)'''
    criteria = {"concepts": ["recursion", "fibonacci", "exponential", "memoization"]}
    return OverviewTask(
        task_id="code_analysis_fib",
        name="Code Analysis",
        description="Analyze the code and explain what it does",
        task_type="code_analysis",
        difficulty="medium",
        input_text=snippet,
        question=None,
        expected_output=None,
        evaluation_criteria=criteria,
        max_steps=5,
    )


def create_info_extraction_task() -> OverviewTask:
    """Build the built-in information-extraction task (person record)."""
    bio = "John Doe is a software engineer at TechCorp. He was born on March 15, 1990, in San Francisco, California. His email is john.doe@techcorp.com and his phone number is (555) 123-4567."
    criteria = {"required_fields": ["name", "email", "phone", "position"]}
    return OverviewTask(
        task_id="extract_person_info",
        name="Information Extraction",
        description="Extract structured information about the person from the text",
        task_type="information_extraction",
        difficulty="easy",
        input_text=bio,
        question=None,
        expected_output=None,
        evaluation_criteria=criteria,
        max_steps=3,
    )


def get_all_tasks() -> List[OverviewTask]:
    """Return one freshly-built instance of every built-in task."""
    factories = (
        create_summarization_task,
        create_qa_task,
        create_code_analysis_task,
        create_info_extraction_task,
    )
    return [factory() for factory in factories]


def get_task_by_id(task_id: str) -> OverviewTask:
    """Look up a built-in task by its id.

    Raises ValueError when no task matches.
    """
    match = next((t for t in get_all_tasks() if t.task_id == task_id), None)
    if match is None:
        raise ValueError(f"Task not found: {task_id}")
    return match