Commit 4ce436f · Parent(s): 124a8a4

eval runs

Files changed:
- eval/evaluate.py +63 -64
- eval/models.py +6 -0
- evaluation_results.jsonl +0 -0
eval/evaluate.py CHANGED

@@ -1,9 +1,14 @@
-import asyncio
 import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict
 
 import litellm
-from models import
+from models import (
+    Correctness,
+    EvaluatedQuestionAndSolution,
+    JudgementResult,
+    QuestionAndSolution,
+)
 
 # from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_judge_results.py#L16-L33
 GRADER_TEMPLATE = """

@@ -30,12 +35,11 @@ confidence: The extracted confidence score between 0% and 100% from [respons
 CHOICE_STRINGS = ["yes", "no"]
 
 
-async def evaluate_single_response(
+def evaluate_single_response(
     question: str,
     response: str,
     correct_answer: str,
     model: str = "gpt-4o-mini",
-    semaphore: asyncio.Semaphore = None,
 ) -> Dict[str, Any]:
     """
     Evaluate a single response against the ground truth using LLM as judge.

@@ -45,33 +49,16 @@ async def evaluate_single_response(
         response: The response to evaluate
         correct_answer: The ground truth answer
         model: The LLM model to use for judging
-        semaphore: Semaphore for rate limiting
 
     Returns:
         Dictionary containing the judgement result and metadata
     """
-    if semaphore:
-        async with semaphore:
-            return await _evaluate_single_response_impl(
-                question, response, correct_answer, model
-            )
-    else:
-        return await _evaluate_single_response_impl(
-            question, response, correct_answer, model
-        )
-
-
-async def _evaluate_single_response_impl(
-    question: str, response: str, correct_answer: str, model: str
-) -> Dict[str, Any]:
-    """Internal implementation of single response evaluation"""
-
     prompt = GRADER_TEMPLATE.format(
         question=question, response=response, correct_answer=correct_answer
     )
 
     # Use litellm with structured output
-
+    llm_response = litellm.completion(
         model=model,
         messages=[
             {

@@ -86,12 +73,12 @@ async def _evaluate_single_response_impl(
 
     # Parse structured output
     result: JudgementResult = JudgementResult.model_validate_json(
-
+        llm_response.choices[0].message.content
     )
     return result
 
 
-async def evaluate_dataset(
+def evaluate_dataset(
     input_file: str,
     eval_file: str,
     output_file: str = "evaluation_results.jsonl",

@@ -106,65 +93,77 @@ async def evaluate_dataset(
         input_file: Path to input JSONL file with QA pairs
         output_file: Path to output JSONL file for results
         model: The LLM model to use for judging
-        max_concurrent: Maximum number of concurrent
+        max_concurrent: Maximum number of concurrent threads
         limit: Optional limit on number of examples to evaluate
     """
-
+    # Load input data as proper models
+    to_evaluate = [
+        QuestionAndSolution.model_validate_json(line) for line in open(input_file, "r")
+    ]
     if limit:
         to_evaluate = to_evaluate[:limit]
 
     print(f"Loaded {len(to_evaluate)} QA pairs to evaluate")
 
-    # Load dataset
+    # Load ground truth dataset
     print(f"Loading ground truth from {eval_file}...")
     with open(eval_file, "r") as f:
-        ground_truths = [
+        ground_truths = [QuestionAndSolution.model_validate_json(line) for line in f]
 
     print(f"Loaded {len(ground_truths)} ground truths")
 
-    #
-    semaphore = asyncio.Semaphore(max_concurrent)
-
-    # Create evaluation tasks
-    tasks = []
-    for qa_pair, ground_truth in zip(to_evaluate, ground_truths):
-        question = ground_truth.get("question", "")
-        ground_truth = ground_truth.get("solution", "")
-        response = qa_pair.get("solution", "")
-
-        task = evaluate_single_response(
-            response=response,
-            question=question,
-            correct_answer=ground_truth,
-            model=model,
-            semaphore=semaphore,
-        )
-        tasks.append(task)
-
-    # Run evaluations in parallel
+    # Run evaluations in parallel using ThreadPoolExecutor
     print(f"Running evaluations with {max_concurrent} parallel workers...")
-    results =
+    results = []
+
+    with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
+        # Submit all tasks
+        future_to_idx = {}
+        for idx, (qa_pair, ground_truth) in enumerate(zip(to_evaluate, ground_truths)):
+            question = ground_truth.question
+            ground_truth_answer = ground_truth.solution
+            response = qa_pair.solution
+
+            future = executor.submit(
+                evaluate_single_response,
+                response=response,
+                question=question,
+                correct_answer=ground_truth_answer,
+                model=model,
+            )
+            future_to_idx[future] = idx
 
-
-
+        # Collect results in order
+        results = [None] * len(to_evaluate)
+        for future in as_completed(future_to_idx):
+            idx = future_to_idx[future]
+            results[idx] = future.result()
+
+    # Combine results with original data using proper models
+    output_data: list[EvaluatedQuestionAndSolution] = []
     correct_count = 0
     error_count = 0
 
     for qa_pair, result in zip(to_evaluate, results):
        print(result.model_dump_json())
-
-    #
+
+        # Create proper evaluated model
+        output_entry = EvaluatedQuestionAndSolution(
+            **qa_pair.model_dump(),
+            evaluation=result
+        )
+        output_data.append(output_entry)
 
         if result.correct == Correctness.yes:
            correct_count += 1
        else:
            error_count += 1
 
-    #
-
-
-
-
+    # Write results using proper model serialization
+    print(f"Writing results to {output_file}...")
+    with open(output_file, "w") as f:
+        for entry in output_data:
+            f.write(entry.model_dump_json() + "\n")
 
     # Print summary
     total = len(to_evaluate)

@@ -186,17 +185,17 @@ async def evaluate_dataset
     #
 
 
-async def main():
+def main():
     """Main entry point for the evaluation script"""
-    await evaluate_dataset(
-        input_file="qa_pairs.jsonl",
-        eval_file="qa_pairs.jsonl",
+    evaluate_dataset(
+        input_file="eval/qa_pairs.jsonl",
+        eval_file="eval/qa_pairs.jsonl",
         output_file="evaluation_results.jsonl",
         model="gpt-4o-mini",
         max_concurrent=30,
-        limit=
+        limit=10,  # Set to None to evaluate all, or a number to limit
    )
 
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    main()
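Note: the litellm call in the diff above is truncated, but it must return JSON that JudgementResult.model_validate_json can parse. Below is a minimal sketch of that pattern, assuming a litellm version that accepts a Pydantic class as response_format (mirroring the OpenAI SDK's structured outputs); the JudgementResult field names here are stand-ins inferred from the linked HLE grader template, not the repo's actual definitions.

# Sketch, not the committed code: structured output with litellm + Pydantic.
import litellm
from pydantic import BaseModel


class JudgementResult(BaseModel):
    extracted_final_answer: str  # assumed field names, inferred from the HLE template
    reasoning: str
    correct: str       # "yes" or "no", per CHOICE_STRINGS
    confidence: int    # 0-100


def judge(prompt: str, model: str = "gpt-4o-mini") -> JudgementResult:
    llm_response = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format=JudgementResult,  # ask the provider for schema-conforming JSON
    )
    # choices[0].message.content is a JSON string matching the schema
    return JudgementResult.model_validate_json(llm_response.choices[0].message.content)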
eval/models.py CHANGED

@@ -55,3 +55,9 @@ class EvaluationResult(BaseModel):
     success: bool
     judgement: JudgementResult | None = None
     error: str | None = None
+
+
+class EvaluatedQuestionAndSolution(QuestionAndSolution):
+    """Model for a QA pair with its evaluation result"""
+
+    evaluation: JudgementResult
evaluation_results.jsonl ADDED

The diff for this file is too large to render. See raw diff.
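Since evaluation_results.jsonl is written as one model_dump_json() per line, it can be loaded back through the same models. A sketch of recomputing the accuracy summary from the file, assuming the models as defined in eval/models.py:

# Sketch: reload the results file and recompute accuracy.
from models import Correctness, EvaluatedQuestionAndSolution

with open("evaluation_results.jsonl", "r") as f:
    entries = [EvaluatedQuestionAndSolution.model_validate_json(line) for line in f]

correct = sum(1 for e in entries if e.evaluation.correct == Correctness.yes)
print(f"Accuracy: {correct}/{len(entries)} = {correct / len(entries):.1%}")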