Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- inference.py +8 -2
- openenv.yaml +9 -0
- supportdesk_env/graders.py +29 -1
- tests/test_supportdesk.py +21 -1
inference.py
CHANGED
|
@@ -32,6 +32,8 @@ MAX_STEPS = int(os.getenv("MAX_STEPS", "8"))
|
|
| 32 |
TEMPERATURE = float(os.getenv("TEMPERATURE", "0"))
|
| 33 |
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "300"))
|
| 34 |
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.1"))
|
|
|
|
|
|
|
| 35 |
|
| 36 |
SYSTEM_PROMPT = """You are a support operations agent solving one triage ticket.
|
| 37 |
Return exactly one JSON object with this schema:
|
|
@@ -181,6 +183,10 @@ def _log_end(success: bool, steps: int, score: float, rewards: list[float]) -> N
|
|
| 181 |
)
|
| 182 |
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
def _run_local_episode(task_id: str, client: OpenAI | None) -> EpisodeResult:
|
| 185 |
env = SupportDeskEnvironment(task_id=task_id)
|
| 186 |
observation = env.reset()
|
|
@@ -280,8 +286,8 @@ async def main() -> None:
|
|
| 280 |
episode = await _run_docker_episode(TASK_NAME, client)
|
| 281 |
else:
|
| 282 |
episode = _run_local_episode(TASK_NAME, client)
|
| 283 |
-
final_score = episode.final_score
|
| 284 |
-
success =
|
| 285 |
steps_taken = episode.steps_taken
|
| 286 |
rewards = episode.rewards
|
| 287 |
finally:
|
|
|
|
| 32 |
TEMPERATURE = float(os.getenv("TEMPERATURE", "0"))
|
| 33 |
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "300"))
|
| 34 |
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.1"))
|
| 35 |
+
SUBMISSION_SCORE_MIN = 0.01
|
| 36 |
+
SUBMISSION_SCORE_MAX = 0.99
|
| 37 |
|
| 38 |
SYSTEM_PROMPT = """You are a support operations agent solving one triage ticket.
|
| 39 |
Return exactly one JSON object with this schema:
|
|
|
|
| 183 |
)
|
| 184 |
|
| 185 |
|
| 186 |
+
def _submission_score(score: float) -> float:
|
| 187 |
+
return max(SUBMISSION_SCORE_MIN, min(SUBMISSION_SCORE_MAX, score))
|
| 188 |
+
|
| 189 |
+
|
| 190 |
def _run_local_episode(task_id: str, client: OpenAI | None) -> EpisodeResult:
|
| 191 |
env = SupportDeskEnvironment(task_id=task_id)
|
| 192 |
observation = env.reset()
|
|
|
|
| 286 |
episode = await _run_docker_episode(TASK_NAME, client)
|
| 287 |
else:
|
| 288 |
episode = _run_local_episode(TASK_NAME, client)
|
| 289 |
+
final_score = _submission_score(episode.final_score)
|
| 290 |
+
success = final_score >= SUCCESS_SCORE_THRESHOLD
|
| 291 |
steps_taken = episode.steps_taken
|
| 292 |
rewards = episode.rewards
|
| 293 |
finally:
|
openenv.yaml
CHANGED
|
@@ -6,3 +6,12 @@ runtime: fastapi
|
|
| 6 |
app: supportdesk_env.server.app:app
|
| 7 |
port: 8000
|
| 8 |
description: Enterprise support operations environment with SLA pressure, business-impact aware triage, and primary-vs-secondary issue prioritization.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
app: supportdesk_env.server.app:app
|
| 7 |
port: 8000
|
| 8 |
description: Enterprise support operations environment with SLA pressure, business-impact aware triage, and primary-vs-secondary issue prioritization.
|
| 9 |
+
tasks:
|
| 10 |
+
- id: billing_refund_easy
|
| 11 |
+
grader: supportdesk_env.graders:BillingRefundEasyGrader
|
| 12 |
+
- id: account_takeover_medium
|
| 13 |
+
grader: supportdesk_env.graders:AccountTakeoverMediumGrader
|
| 14 |
+
- id: api_incident_hard
|
| 15 |
+
grader: supportdesk_env.graders:ApiIncidentHardGrader
|
| 16 |
+
- id: regulated_export_exception_hard
|
| 17 |
+
grader: supportdesk_env.graders:RegulatedExportExceptionHardGrader
|
supportdesk_env/graders.py
CHANGED
|
@@ -8,7 +8,7 @@ from dataclasses import dataclass
|
|
| 8 |
from supportdesk_env.models import SupportCaseProgress
|
| 9 |
from supportdesk_env.tasks import SupportTaskSpec, get_task
|
| 10 |
|
| 11 |
-
STRICT_SCORE_EPSILON = 0.
|
| 12 |
|
| 13 |
|
| 14 |
@dataclass(frozen=True)
|
|
@@ -137,3 +137,31 @@ def grade_task_id(task_id: str, case: SupportCaseProgress) -> GradeBreakdown:
|
|
| 137 |
"""Convenience wrapper used by tests and evaluation scripts."""
|
| 138 |
|
| 139 |
return grade_case(get_task(task_id), case)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
from supportdesk_env.models import SupportCaseProgress
|
| 9 |
from supportdesk_env.tasks import SupportTaskSpec, get_task
|
| 10 |
|
| 11 |
+
STRICT_SCORE_EPSILON = 0.01
|
| 12 |
|
| 13 |
|
| 14 |
@dataclass(frozen=True)
|
|
|
|
| 137 |
"""Convenience wrapper used by tests and evaluation scripts."""
|
| 138 |
|
| 139 |
return grade_case(get_task(task_id), case)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class _TaskSpecificGrader:
|
| 143 |
+
"""Importable task-specific grader wrapper for validator task discovery."""
|
| 144 |
+
|
| 145 |
+
task_id: str = ""
|
| 146 |
+
|
| 147 |
+
def grade(self, case: SupportCaseProgress) -> float:
|
| 148 |
+
return grade_task_id(self.task_id, case).total_score
|
| 149 |
+
|
| 150 |
+
def __call__(self, case: SupportCaseProgress) -> float:
|
| 151 |
+
return self.grade(case)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
class BillingRefundEasyGrader(_TaskSpecificGrader):
|
| 155 |
+
task_id = "billing_refund_easy"
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
class AccountTakeoverMediumGrader(_TaskSpecificGrader):
|
| 159 |
+
task_id = "account_takeover_medium"
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
class ApiIncidentHardGrader(_TaskSpecificGrader):
|
| 163 |
+
task_id = "api_incident_hard"
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
class RegulatedExportExceptionHardGrader(_TaskSpecificGrader):
|
| 167 |
+
task_id = "regulated_export_exception_hard"
|
tests/test_supportdesk.py
CHANGED
|
@@ -66,7 +66,7 @@ def test_perfect_solution_grades_full_score():
|
|
| 66 |
)
|
| 67 |
|
| 68 |
breakdown = grade_case(task, env.state.case)
|
| 69 |
-
assert breakdown.total_score == 0.
|
| 70 |
|
| 71 |
|
| 72 |
def test_max_steps_ends_episode():
|
|
@@ -86,6 +86,26 @@ def test_grade_is_bounded_between_zero_and_one():
|
|
| 86 |
assert 0.0 < breakdown.total_score < 1.0
|
| 87 |
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
def test_state_includes_episode_id_after_reset():
|
| 90 |
env = SupportDeskEnvironment(task_id="billing_refund_easy")
|
| 91 |
env.reset(episode_id="episode-123")
|
|
|
|
| 66 |
)
|
| 67 |
|
| 68 |
breakdown = grade_case(task, env.state.case)
|
| 69 |
+
assert breakdown.total_score == 0.99
|
| 70 |
|
| 71 |
|
| 72 |
def test_max_steps_ends_episode():
|
|
|
|
| 86 |
assert 0.0 < breakdown.total_score < 1.0
|
| 87 |
|
| 88 |
|
| 89 |
+
def test_task_specific_graders_are_importable_and_clamped():
|
| 90 |
+
from supportdesk_env.graders import (
|
| 91 |
+
AccountTakeoverMediumGrader,
|
| 92 |
+
ApiIncidentHardGrader,
|
| 93 |
+
BillingRefundEasyGrader,
|
| 94 |
+
RegulatedExportExceptionHardGrader,
|
| 95 |
+
)
|
| 96 |
+
from supportdesk_env.models import SupportCaseProgress
|
| 97 |
+
|
| 98 |
+
case = SupportCaseProgress()
|
| 99 |
+
scores = [
|
| 100 |
+
BillingRefundEasyGrader().grade(case),
|
| 101 |
+
AccountTakeoverMediumGrader().grade(case),
|
| 102 |
+
ApiIncidentHardGrader().grade(case),
|
| 103 |
+
RegulatedExportExceptionHardGrader().grade(case),
|
| 104 |
+
]
|
| 105 |
+
|
| 106 |
+
assert scores == [0.15, 0.01, 0.01, 0.01]
|
| 107 |
+
|
| 108 |
+
|
| 109 |
def test_state_includes_episode_id_after_reset():
|
| 110 |
env = SupportDeskEnvironment(task_id="billing_refund_easy")
|
| 111 |
env.reset(episode_id="episode-123")
|