akseljoonas (HF Staff) committed
Commit 4ce436f · 1 parent: 124a8a4

eval runs

Files changed (3)
  1. eval/evaluate.py +63 -64
  2. eval/models.py +6 -0
  3. evaluation_results.jsonl +0 -0
eval/evaluate.py CHANGED
@@ -1,9 +1,14 @@
-import asyncio
 import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict
 
 import litellm
-from models import Correctness, JudgementResult
+from models import (
+    Correctness,
+    EvaluatedQuestionAndSolution,
+    JudgementResult,
+    QuestionAndSolution,
+)
 
 # from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_judge_results.py#L16-L33
 GRADER_TEMPLATE = """
@@ -30,12 +35,11 @@ confidence: The extracted confidence score between 0% and 100% from [response]
 CHOICE_STRINGS = ["yes", "no"]
 
 
-async def evaluate_single_response(
+def evaluate_single_response(
     question: str,
     response: str,
     correct_answer: str,
     model: str = "gpt-4o-mini",
-    semaphore: asyncio.Semaphore = None,
 ) -> Dict[str, Any]:
     """
     Evaluate a single response against the ground truth using LLM as judge.
@@ -45,33 +49,16 @@ async def evaluate_single_response(
         response: The response to evaluate
        correct_answer: The ground truth answer
         model: The LLM model to use for judging
-        semaphore: Semaphore for rate limiting
 
     Returns:
         Dictionary containing the judgement result and metadata
     """
-    if semaphore:
-        async with semaphore:
-            return await _evaluate_single_response_impl(
-                question, response, correct_answer, model
-            )
-    else:
-        return await _evaluate_single_response_impl(
-            question, response, correct_answer, model
-        )
-
-
-async def _evaluate_single_response_impl(
-    question: str, response: str, correct_answer: str, model: str
-) -> Dict[str, Any]:
-    """Internal implementation of single response evaluation"""
-
     prompt = GRADER_TEMPLATE.format(
         question=question, response=response, correct_answer=correct_answer
     )
 
     # Use litellm with structured output
-    response = await litellm.acompletion(
+    llm_response = litellm.completion(
         model=model,
         messages=[
             {
@@ -86,12 +73,12 @@ async def _evaluate_single_response_impl(
 
     # Parse structured output
     result: JudgementResult = JudgementResult.model_validate_json(
-        response.choices[0].message.content
+        llm_response.choices[0].message.content
     )
     return result
 
 
-async def evaluate_dataset(
+def evaluate_dataset(
     input_file: str,
     eval_file: str,
     output_file: str = "evaluation_results.jsonl",
@@ -106,65 +93,77 @@ async def evaluate_dataset(
         input_file: Path to input JSONL file with QA pairs
         output_file: Path to output JSONL file for results
         model: The LLM model to use for judging
-        max_concurrent: Maximum number of concurrent API calls
+        max_concurrent: Maximum number of concurrent threads
         limit: Optional limit on number of examples to evaluate
     """
-    to_evaluate = [json.loads(line) for line in open(input_file, "r")]
+    # Load input data as proper models
+    to_evaluate = [
+        QuestionAndSolution.model_validate_json(line) for line in open(input_file, "r")
+    ]
     if limit:
         to_evaluate = to_evaluate[:limit]
 
     print(f"Loaded {len(to_evaluate)} QA pairs to evaluate")
 
-    # Load dataset
+    # Load ground truth dataset
     print(f"Loading ground truth from {eval_file}...")
     with open(eval_file, "r") as f:
-        ground_truths = [json.loads(line) for line in f]
+        ground_truths = [QuestionAndSolution.model_validate_json(line) for line in f]
 
     print(f"Loaded {len(ground_truths)} ground truths")
 
-    # Create semaphore for rate limiting
-    semaphore = asyncio.Semaphore(max_concurrent)
-
-    # Create evaluation tasks
-    tasks = []
-    for qa_pair, ground_truth in zip(to_evaluate, ground_truths):
-        question = ground_truth.get("question", "")
-        ground_truth = ground_truth.get("solution", "")
-        response = qa_pair.get("solution", "")
-
-        task = evaluate_single_response(
-            response=response,
-            question=question,
-            correct_answer=ground_truth,
-            model=model,
-            semaphore=semaphore,
-        )
-        tasks.append(task)
-
-    # Run evaluations in parallel
+    # Run evaluations in parallel using ThreadPoolExecutor
     print(f"Running evaluations with {max_concurrent} parallel workers...")
-    results = await asyncio.gather(*tasks)
+    results = []
+
+    with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
+        # Submit all tasks
+        future_to_idx = {}
+        for idx, (qa_pair, ground_truth) in enumerate(zip(to_evaluate, ground_truths)):
+            question = ground_truth.question
+            ground_truth_answer = ground_truth.solution
+            response = qa_pair.solution
+
+            future = executor.submit(
+                evaluate_single_response,
+                response=response,
+                question=question,
+                correct_answer=ground_truth_answer,
+                model=model,
+            )
+            future_to_idx[future] = idx
 
-    # Combine results with original data
-    output_data = []
+        # Collect results in order
+        results = [None] * len(to_evaluate)
+        for future in as_completed(future_to_idx):
+            idx = future_to_idx[future]
+            results[idx] = future.result()
+
+    # Combine results with original data using proper models
+    output_data: list[EvaluatedQuestionAndSolution] = []
     correct_count = 0
     error_count = 0
 
     for qa_pair, result in zip(to_evaluate, results):
         print(result.model_dump_json())
-        # output_entry = {**qa_pair, "evaluation": result}
-        # output_data.append(output_entry)
+
+        # Create proper evaluated model
+        output_entry = EvaluatedQuestionAndSolution(
+            **qa_pair.model_dump(),
+            evaluation=result
+        )
+        output_data.append(output_entry)
 
         if result.correct == Correctness.yes:
             correct_count += 1
         else:
             error_count += 1
 
-    # # Write results
-    # print(f"Writing results to {output_file}...")
-    # with open(output_file, "w") as f:
-    #     for entry in output_data:
-    #         f.write(entry.model_dump_json() + "\n")
+    # Write results using proper model serialization
+    print(f"Writing results to {output_file}...")
+    with open(output_file, "w") as f:
+        for entry in output_data:
+            f.write(entry.model_dump_json() + "\n")
 
     # Print summary
     total = len(to_evaluate)
@@ -186,17 +185,17 @@ async def evaluate_dataset
 #
 
 
-async def main():
+def main():
     """Main entry point for the evaluation script"""
-    await evaluate_dataset(
-        input_file="qa_pairs.jsonl",
-        eval_file="qa_pairs.jsonl",
+    evaluate_dataset(
+        input_file="eval/qa_pairs.jsonl",
+        eval_file="eval/qa_pairs.jsonl",
         output_file="evaluation_results.jsonl",
         model="gpt-4o-mini",
        max_concurrent=30,
-        limit=100, # Set to None to evaluate all, or a number to limit
+        limit=10, # Set to None to evaluate all, or a number to limit
     )
 
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    main()
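The core change in this file swaps asyncio plus a semaphore for a thread pool. For reference, a minimal, self-contained sketch of the same submit / as_completed pattern used above, with results written back in input order; slow_judge and the inputs are placeholders, not part of this repo:

from concurrent.futures import ThreadPoolExecutor, as_completed

def slow_judge(item: str) -> str:
    # Stand-in for a blocking call such as litellm.completion()
    return item.upper()

items = ["a", "b", "c"]
results = [None] * len(items)

with ThreadPoolExecutor(max_workers=4) as executor:
    # Map each future back to the index of the item it was submitted for
    future_to_idx = {executor.submit(slow_judge, item): i for i, item in enumerate(items)}
    for future in as_completed(future_to_idx):
        # Futures complete in arbitrary order; indexing keeps results aligned with inputs
        results[future_to_idx[future]] = future.result()

print(results)  # ['A', 'B', 'C']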
eval/models.py CHANGED
@@ -55,3 +55,9 @@ class EvaluationResult(BaseModel):
     success: bool
     judgement: JudgementResult | None = None
     error: str | None = None
+
+
+class EvaluatedQuestionAndSolution(QuestionAndSolution):
+    """Model for a QA pair with its evaluation result"""
+
+    evaluation: JudgementResult
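EvaluatedQuestionAndSolution extends QuestionAndSolution, which is defined earlier in eval/models.py and not shown in this diff. From how evaluate.py uses the models (ground_truth.question, qa_pair.solution, result.correct == Correctness.yes, and the confidence field requested by the grader prompt), the referenced pieces plausibly look like the sketch below; any field or type beyond question, solution, and correct is an assumption:

from enum import Enum

from pydantic import BaseModel

class Correctness(str, Enum):
    yes = "yes"
    no = "no"

class QuestionAndSolution(BaseModel):
    # Fields read by evaluate.py
    question: str
    solution: str

class JudgementResult(BaseModel):
    # evaluate.py only checks `correct`; `confidence` is suggested by the
    # grader prompt, and its exact type here is assumed
    correct: Correctness
    confidence: int | None = None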
evaluation_results.jsonl ADDED
The diff for this file is too large to render. See raw diff