Jayant-Kernel Claude Sonnet 4.6 committed on
feat: add 429 retry wrapper to grader semantic check
Browse files
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- src/deceit_env/server/grader.py +21 -6
- tests/test_grader.py +27 -0
src/deceit_env/server/grader.py
CHANGED
|
@@ -11,6 +11,7 @@ import hashlib
|
|
| 11 |
import json
|
| 12 |
import re
|
| 13 |
import pathlib
|
|
|
|
| 14 |
from dataclasses import dataclass
|
| 15 |
|
| 16 |
import os
|
|
@@ -93,12 +94,26 @@ class Grader:
|
|
| 93 |
f"Is '{answer}' semantically equivalent to '{ground_truth}'? "
|
| 94 |
"Reply YES or NO only."
|
| 95 |
)
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
verdict = response.choices[0].message.content.strip().upper()
|
| 103 |
correct = verdict.startswith("YES")
|
| 104 |
|
|
|
|
| 11 |
import json
|
| 12 |
import re
|
| 13 |
import pathlib
|
| 14 |
+
import time
|
| 15 |
from dataclasses import dataclass
|
| 16 |
|
| 17 |
import os
|
|
|
|
| 94 |
f"Is '{answer}' semantically equivalent to '{ground_truth}'? "
|
| 95 |
"Reply YES or NO only."
|
| 96 |
)
|
| 97 |
+
|
| 98 |
+
max_retries = 3
|
| 99 |
+
for attempt in range(max_retries):
|
| 100 |
+
try:
|
| 101 |
+
response = client.chat.completions.create(
|
| 102 |
+
model="gpt-4o-mini",
|
| 103 |
+
messages=[{"role": "user", "content": prompt}],
|
| 104 |
+
max_tokens=5,
|
| 105 |
+
temperature=0,
|
| 106 |
+
)
|
| 107 |
+
break
|
| 108 |
+
except Exception as e:
|
| 109 |
+
if "429" in str(e) or "RateLimitError" in type(e).__name__:
|
| 110 |
+
print(f"[grader] Rate limit hit (attempt {attempt + 1}/{max_retries}), waiting 25s...")
|
| 111 |
+
time.sleep(25)
|
| 112 |
+
if attempt == max_retries - 1:
|
| 113 |
+
raise
|
| 114 |
+
else:
|
| 115 |
+
raise
|
| 116 |
+
|
| 117 |
verdict = response.choices[0].message.content.strip().upper()
|
| 118 |
correct = verdict.startswith("YES")
|
| 119 |
|
tests/test_grader.py
CHANGED
|
@@ -100,3 +100,30 @@ class TestSemanticMatch:
|
|
| 100 |
def test_error_raised_without_api_key(self, tmp_grader):
|
| 101 |
with pytest.raises(RuntimeError, match="no OpenAI API key"):
|
| 102 |
tmp_grader.check("Sydney", "Canberra")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
def test_error_raised_without_api_key(self, tmp_grader):
|
| 101 |
with pytest.raises(RuntimeError, match="no OpenAI API key"):
|
| 102 |
tmp_grader.check("Sydney", "Canberra")
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
class TestRateLimitRetry:
    """Retry behaviour of the grader check when the API reports a rate limit."""

    def test_retries_on_429_then_succeeds(self, api_grader):
        """One rate-limit error is followed by a single 25s sleep and a retry.

        The mocked client raises ``RateLimitError`` on the first call and
        returns a "YES" completion on the second, so the check must report
        ``correct`` and the completion endpoint must be hit exactly twice.
        """
        from openai import RateLimitError
        import httpx

        # Successful completion whose verdict text is "YES".
        mock_choice = MagicMock()
        mock_choice.message.content = "YES"
        ok_response = MagicMock()
        ok_response.choices = [mock_choice]

        # Build the error from an httpx.Response carrying status 429.
        _dummy_request = httpx.Request("POST", "https://api.openai.com/v1/chat/completions")
        rate_err = RateLimitError(
            "rate limited",
            response=httpx.Response(429, request=_dummy_request),
            body={},
        )

        # First call raises the rate-limit error, second call returns normally.
        mock_client = MagicMock()
        mock_client.chat.completions.create.side_effect = [rate_err, ok_response]

        with patch("deceit_env.server.grader.OpenAI", return_value=mock_client):
            # Patch sleep so the test does not actually wait 25 seconds.
            with patch("time.sleep") as mock_sleep:
                result = api_grader.check("The Australian capital", "Canberra")

        assert result.correct is True
        assert mock_client.chat.completions.create.call_count == 2
        mock_sleep.assert_called_once_with(25)
|