modelbuilderhq committed on
Commit
220e9f3
·
verified ·
1 Parent(s): 1305932

Upload folder using huggingface_hub

Browse files
inference.py CHANGED
@@ -32,6 +32,8 @@ MAX_STEPS = int(os.getenv("MAX_STEPS", "8"))
32
  TEMPERATURE = float(os.getenv("TEMPERATURE", "0"))
33
  MAX_TOKENS = int(os.getenv("MAX_TOKENS", "300"))
34
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.1"))
 
 
35
 
36
  SYSTEM_PROMPT = """You are a support operations agent solving one triage ticket.
37
  Return exactly one JSON object with this schema:
@@ -181,6 +183,10 @@ def _log_end(success: bool, steps: int, score: float, rewards: list[float]) -> N
181
  )
182
 
183
 
 
 
 
 
184
  def _run_local_episode(task_id: str, client: OpenAI | None) -> EpisodeResult:
185
  env = SupportDeskEnvironment(task_id=task_id)
186
  observation = env.reset()
@@ -280,8 +286,8 @@ async def main() -> None:
280
  episode = await _run_docker_episode(TASK_NAME, client)
281
  else:
282
  episode = _run_local_episode(TASK_NAME, client)
283
- final_score = episode.final_score
284
- success = episode.final_score >= SUCCESS_SCORE_THRESHOLD
285
  steps_taken = episode.steps_taken
286
  rewards = episode.rewards
287
  finally:
 
32
  TEMPERATURE = float(os.getenv("TEMPERATURE", "0"))
33
  MAX_TOKENS = int(os.getenv("MAX_TOKENS", "300"))
34
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.1"))
35
+ SUBMISSION_SCORE_MIN = 0.01
36
+ SUBMISSION_SCORE_MAX = 0.99
37
 
38
  SYSTEM_PROMPT = """You are a support operations agent solving one triage ticket.
39
  Return exactly one JSON object with this schema:
 
183
  )
184
 
185
 
186
+ def _submission_score(score: float) -> float:
187
+ return max(SUBMISSION_SCORE_MIN, min(SUBMISSION_SCORE_MAX, score))
188
+
189
+
190
  def _run_local_episode(task_id: str, client: OpenAI | None) -> EpisodeResult:
191
  env = SupportDeskEnvironment(task_id=task_id)
192
  observation = env.reset()
 
286
  episode = await _run_docker_episode(TASK_NAME, client)
287
  else:
288
  episode = _run_local_episode(TASK_NAME, client)
289
+ final_score = _submission_score(episode.final_score)
290
+ success = final_score >= SUCCESS_SCORE_THRESHOLD
291
  steps_taken = episode.steps_taken
292
  rewards = episode.rewards
293
  finally:
openenv.yaml CHANGED
@@ -6,3 +6,12 @@ runtime: fastapi
6
  app: supportdesk_env.server.app:app
7
  port: 8000
8
  description: Enterprise support operations environment with SLA pressure, business-impact aware triage, and primary-vs-secondary issue prioritization.
 
 
 
 
 
 
 
 
 
 
6
  app: supportdesk_env.server.app:app
7
  port: 8000
8
  description: Enterprise support operations environment with SLA pressure, business-impact aware triage, and primary-vs-secondary issue prioritization.
9
+ tasks:
10
+ - id: billing_refund_easy
11
+ grader: supportdesk_env.graders:BillingRefundEasyGrader
12
+ - id: account_takeover_medium
13
+ grader: supportdesk_env.graders:AccountTakeoverMediumGrader
14
+ - id: api_incident_hard
15
+ grader: supportdesk_env.graders:ApiIncidentHardGrader
16
+ - id: regulated_export_exception_hard
17
+ grader: supportdesk_env.graders:RegulatedExportExceptionHardGrader
supportdesk_env/graders.py CHANGED
@@ -8,7 +8,7 @@ from dataclasses import dataclass
8
  from supportdesk_env.models import SupportCaseProgress
9
  from supportdesk_env.tasks import SupportTaskSpec, get_task
10
 
11
- STRICT_SCORE_EPSILON = 0.001
12
 
13
 
14
  @dataclass(frozen=True)
@@ -137,3 +137,31 @@ def grade_task_id(task_id: str, case: SupportCaseProgress) -> GradeBreakdown:
137
  """Convenience wrapper used by tests and evaluation scripts."""
138
 
139
  return grade_case(get_task(task_id), case)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  from supportdesk_env.models import SupportCaseProgress
9
  from supportdesk_env.tasks import SupportTaskSpec, get_task
10
 
11
+ STRICT_SCORE_EPSILON = 0.01
12
 
13
 
14
  @dataclass(frozen=True)
 
137
  """Convenience wrapper used by tests and evaluation scripts."""
138
 
139
  return grade_case(get_task(task_id), case)
140
+
141
+
142
+ class _TaskSpecificGrader:
143
+ """Importable task-specific grader wrapper for validator task discovery."""
144
+
145
+ task_id: str = ""
146
+
147
+ def grade(self, case: SupportCaseProgress) -> float:
148
+ return grade_task_id(self.task_id, case).total_score
149
+
150
+ def __call__(self, case: SupportCaseProgress) -> float:
151
+ return self.grade(case)
152
+
153
+
154
+ class BillingRefundEasyGrader(_TaskSpecificGrader):
155
+ task_id = "billing_refund_easy"
156
+
157
+
158
+ class AccountTakeoverMediumGrader(_TaskSpecificGrader):
159
+ task_id = "account_takeover_medium"
160
+
161
+
162
+ class ApiIncidentHardGrader(_TaskSpecificGrader):
163
+ task_id = "api_incident_hard"
164
+
165
+
166
+ class RegulatedExportExceptionHardGrader(_TaskSpecificGrader):
167
+ task_id = "regulated_export_exception_hard"
tests/test_supportdesk.py CHANGED
@@ -66,7 +66,7 @@ def test_perfect_solution_grades_full_score():
66
  )
67
 
68
  breakdown = grade_case(task, env.state.case)
69
- assert breakdown.total_score == 0.999
70
 
71
 
72
  def test_max_steps_ends_episode():
@@ -86,6 +86,26 @@ def test_grade_is_bounded_between_zero_and_one():
86
  assert 0.0 < breakdown.total_score < 1.0
87
 
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def test_state_includes_episode_id_after_reset():
90
  env = SupportDeskEnvironment(task_id="billing_refund_easy")
91
  env.reset(episode_id="episode-123")
 
66
  )
67
 
68
  breakdown = grade_case(task, env.state.case)
69
+ assert breakdown.total_score == 0.99
70
 
71
 
72
  def test_max_steps_ends_episode():
 
86
  assert 0.0 < breakdown.total_score < 1.0
87
 
88
 
89
+ def test_task_specific_graders_are_importable_and_clamped():
90
+ from supportdesk_env.graders import (
91
+ AccountTakeoverMediumGrader,
92
+ ApiIncidentHardGrader,
93
+ BillingRefundEasyGrader,
94
+ RegulatedExportExceptionHardGrader,
95
+ )
96
+ from supportdesk_env.models import SupportCaseProgress
97
+
98
+ case = SupportCaseProgress()
99
+ scores = [
100
+ BillingRefundEasyGrader().grade(case),
101
+ AccountTakeoverMediumGrader().grade(case),
102
+ ApiIncidentHardGrader().grade(case),
103
+ RegulatedExportExceptionHardGrader().grade(case),
104
+ ]
105
+
106
+ assert scores == [0.15, 0.01, 0.01, 0.01]
107
+
108
+
109
  def test_state_includes_episode_id_after_reset():
110
  env = SupportDeskEnvironment(task_id="billing_refund_easy")
111
  env.reset(episode_id="episode-123")