hirann committed on
Commit
abed3e5
·
verified ·
1 Parent(s): f440b93

Upload tasks/definitions.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tasks/definitions.py +124 -0
tasks/definitions.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Any
2
+ from dataclasses import dataclass
3
+
4
+
5
+ @dataclass
6
+ class OverviewTask:
7
+ task_id: str
8
+ name: str
9
+ description: str
10
+ task_type: str
11
+ difficulty: str
12
+ input_text: str
13
+ question: str | None
14
+ expected_output: str | None
15
+ evaluation_criteria: Dict[str, Any]
16
+ max_steps: int
17
+
18
+
19
+ @dataclass
20
+ class GradingResult:
21
+ score: float
22
+ accuracy: float
23
+ quality: float
24
+ completeness: float
25
+ feedback: str
26
+
27
+
28
+ class OverviewTaskEvaluator:
29
+ def __init__(self, task: OverviewTask):
30
+ self.task = task
31
+
32
+ def grade(self, response: Dict[str, Any]) -> GradingResult:
33
+ if self.task.task_type == "summarization":
34
+ return self._grade_summarization(response)
35
+ elif self.task.task_type == "question_answering":
36
+ return self._grade_qa(response)
37
+ elif self.task.task_type == "code_analysis":
38
+ return self._grade_code_analysis(response)
39
+ elif self.task.task_type == "information_extraction":
40
+ return self._grade_info_extraction(response)
41
+ return GradingResult(0.0, 0.0, 0.0, 0.0, "Unknown task type")
42
+
43
+ def _grade_summarization(self, response: Dict[str, Any]) -> GradingResult:
44
+ summary = response.get("summary", "").lower()
45
+ criteria = self.task.evaluation_criteria
46
+ key_points = criteria.get("key_points", [])
47
+ found_points = sum(1 for pt in key_points if pt.lower() in summary)
48
+ completeness = found_points / len(key_points) if key_points else 0.5
49
+ word_count = len(summary.split())
50
+ target_length = criteria.get("target_length", 50)
51
+ quality = min(1.0, word_count / target_length) if target_length > 0 else 0.8
52
+ accuracy = 0.9 if summary.strip() else 0.0
53
+ score = accuracy * 0.3 + quality * 0.3 + completeness * 0.4
54
+ return GradingResult(score=score, accuracy=accuracy, quality=quality, completeness=completeness, feedback=f"Found {found_points}/{len(key_points)} key points")
55
+
56
+ def _grade_qa(self, response: Dict[str, Any]) -> GradingResult:
57
+ answer = response.get("answer", "").lower()
58
+ expected = self.task.expected_output.lower() if self.task.expected_output else ""
59
+ if expected:
60
+ answer_words = set(answer.split())
61
+ expected_words = set(expected.split())
62
+ accuracy = len(answer_words & expected_words) / max(len(expected_words), 1)
63
+ else:
64
+ accuracy = min(1.0, len(answer.split()) / 10) if answer.strip() else 0.0
65
+ quality = 0.9 if any(w in answer for w in ["because", "since", "therefore"]) else 0.6
66
+ completeness = 1.0 if answer.strip() else 0.0
67
+ score = accuracy * 0.5 + quality * 0.3 + completeness * 0.2
68
+ return GradingResult(score=score, accuracy=accuracy, quality=quality, completeness=completeness, feedback=f"Answer length: {len(answer.split())} words")
69
+
70
+ def _grade_code_analysis(self, response: Dict[str, Any]) -> GradingResult:
71
+ explanation = response.get("explanation", "").lower()
72
+ criteria = self.task.evaluation_criteria
73
+ required_concepts = criteria.get("concepts", [])
74
+ found_concepts = sum(1 for c in required_concepts if c.lower() in explanation)
75
+ completeness = found_concepts / len(required_concepts) if required_concepts else 0.5
76
+ quality = 0.9 if "example" in explanation or "e.g." in explanation else 0.7
77
+ accuracy = 0.85 if explanation.strip() else 0.0
78
+ score = accuracy * 0.4 + quality * 0.3 + completeness * 0.3
79
+ return GradingResult(score=score, accuracy=accuracy, quality=quality, completeness=completeness, feedback=f"Identified {found_concepts}/{len(required_concepts)} concepts")
80
+
81
+ def _grade_info_extraction(self, response: Dict[str, Any]) -> GradingResult:
82
+ extracted = response.get("extracted_info", {})
83
+ criteria = self.task.evaluation_criteria
84
+ fields = criteria.get("required_fields", [])
85
+ found_fields = sum(1 for f in fields if f in extracted and extracted[f])
86
+ completeness = found_fields / len(fields) if fields else 0.5
87
+ quality = 0.8 if isinstance(extracted, dict) else 0.5
88
+ accuracy = 0.85 if extracted else 0.0
89
+ score = accuracy * 0.3 + quality * 0.3 + completeness * 0.4
90
+ return GradingResult(score=score, accuracy=accuracy, quality=quality, completeness=completeness, feedback=f"Extracted {found_fields}/{len(fields)} fields")
91
+
92
+
93
+ def create_summarization_task() -> OverviewTask:
94
+ text = "Artificial Intelligence (AI) is rapidly transforming various industries, from healthcare to finance. In healthcare, AI algorithms can analyze medical images with remarkable accuracy, helping doctors detect diseases earlier. In finance, AI-powered systems detect fraudulent transactions in real-time, saving millions of dollars annually. However, AI also raises ethical concerns about privacy, bias, and job displacement."
95
+ return OverviewTask(task_id="summarize_article", name="Article Summarization", description="Provide a concise summary of the given article about AI", task_type="summarization", difficulty="easy", input_text=text, question=None, expected_output=None, evaluation_criteria={"key_points": ["transforming industries", "healthcare", "finance", "ethical concerns"], "target_length": 50}, max_steps=5)
96
+
97
+
98
+ def create_qa_task() -> OverviewTask:
99
+ text = "Python is a high-level programming language created by Guido van Rossum in 1991. It emphasizes code readability with its notable use of significant indentation. Python supports multiple programming paradigms, including structured, procedural, reflective, and object-oriented programming."
100
+ return OverviewTask(task_id="python_qa", name="Question Answering", description="Answer the question based on the provided text", task_type="question_answering", difficulty="easy", input_text=text, question="Who created Python and when?", expected_output="Guido van Rossum in 1991", evaluation_criteria={}, max_steps=3)
101
+
102
+
103
+ def create_code_analysis_task() -> OverviewTask:
104
+ code = '''def fibonacci(n):
105
+ if n <= 1:
106
+ return n
107
+ return fibonacci(n-1) + fibonacci(n-2)'''
108
+ return OverviewTask(task_id="code_analysis_fib", name="Code Analysis", description="Analyze the code and explain what it does", task_type="code_analysis", difficulty="medium", input_text=code, question=None, expected_output=None, evaluation_criteria={"concepts": ["recursion", "fibonacci", "exponential", "memoization"]}, max_steps=5)
109
+
110
+
111
+ def create_info_extraction_task() -> OverviewTask:
112
+ text = "John Doe is a software engineer at TechCorp. He was born on March 15, 1990, in San Francisco, California. His email is john.doe@techcorp.com and his phone number is (555) 123-4567."
113
+ return OverviewTask(task_id="extract_person_info", name="Information Extraction", description="Extract structured information about the person from the text", task_type="information_extraction", difficulty="easy", input_text=text, question=None, expected_output=None, evaluation_criteria={"required_fields": ["name", "email", "phone", "position"]}, max_steps=3)
114
+
115
+
116
+ def get_all_tasks() -> List[OverviewTask]:
117
+ return [create_summarization_task(), create_qa_task(), create_code_analysis_task(), create_info_extraction_task()]
118
+
119
+
120
+ def get_task_by_id(task_id: str) -> OverviewTask:
121
+ for task in get_all_tasks():
122
+ if task.task_id == task_id:
123
+ return task
124
+ raise ValueError(f"Task not found: {task_id}")