modelbuilderhq committed on
Commit
551c5bc
·
verified ·
1 Parent(s): 8c82a6f

Upload folder using huggingface_hub

Browse files
check_graders.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
"""Print the type and value of each task's ``grader`` entry in openenv.yaml."""
import yaml

# The previous literals carried shell-escaping residue ('"'"'), which Python
# concatenated into names containing literal quote characters, so the wrong
# file and keys were looked up.  Plain literals are used here instead.
# The context manager closes the handle; the explicit encoding keeps
# non-ASCII text in the manifest from depending on the platform default.
with open('openenv.yaml', encoding='utf-8') as manifest_file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the lookup below does not fail on None.
    manifest = yaml.safe_load(manifest_file) or {}

for task in manifest.get('tasks', []):
    grader = task.get('grader')  # None when the task declares no grader
    print(f"{task['id']}: grader type = {type(grader).__name__}, value = {grader}")
check_yaml.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
"""Report how each task's ``grader`` field is declared in openenv.yaml."""
import yaml

# Load the manifest; the handle is released as soon as parsing finishes.
with open('openenv.yaml') as manifest_file:
    manifest = yaml.safe_load(manifest_file)

# A manifest without a "tasks" key simply produces no output.
task_entries = manifest.get("tasks", [])
for entry in task_entries:
    grader_spec = entry.get("grader")  # None when the key is absent
    print(f"{entry['id']}: grader type = {type(grader_spec).__name__}, value = {grader_spec}")
openenv.yaml CHANGED
@@ -15,13 +15,37 @@ endpoints:
15
  tasks:
16
  - id: billing_refund_easy
17
  difficulty: easy
18
- grader: graders:BillingRefundEasyGrader
 
 
 
 
 
 
19
  - id: account_takeover_medium
20
  difficulty: medium
21
- grader: graders:AccountTakeoverMediumGrader
 
 
 
 
 
 
22
  - id: api_incident_hard
23
  difficulty: hard
24
- grader: graders:ApiIncidentHardGrader
 
 
 
 
 
 
25
  - id: regulated_export_exception_hard
26
  difficulty: hard
27
- grader: graders:RegulatedExportExceptionHardGrader
 
 
 
 
 
 
 
15
  tasks:
16
  - id: billing_refund_easy
17
  difficulty: easy
18
+ max_steps: 6
19
+ grader:
20
+ type: llm
21
+ prompt_template: |
22
+ You are a QA judge. Score the agent's handling of a duplicate-charge refund ticket from 0.0 to 1.0.
23
+ Full score if: queue=billing_ops, priority=high, issue_type=duplicate_charge, status=resolved,
24
+ reply confirms refund with reference/timeline, and internal note mentions duplicate charge/refund.
25
  - id: account_takeover_medium
26
  difficulty: medium
27
+ max_steps: 7
28
+ grader:
29
+ type: llm
30
+ prompt_template: |
31
+ You are a QA judge. Score 0.0–1.0 for account-compromise recovery.
32
+ Full score if: account locked, sessions invalidated, ownership verification requested, queue=trust_and_safety,
33
+ status escalated/waiting_on_customer, and reply is security-appropriate (no promises).
34
  - id: api_incident_hard
35
  difficulty: hard
36
+ max_steps: 8
37
+ grader:
38
+ type: llm
39
+ prompt_template: |
40
+ You are a QA judge. Score 0.0–1.0 for production API incident triage.
41
+ Full score if: request IDs/time window collected, status page mentioned, escalation to platform_engineering,
42
+ queue=platform_engineering, and reply acknowledges incident.
43
  - id: regulated_export_exception_hard
44
  difficulty: hard
45
+ max_steps: 8
46
+ grader:
47
+ type: llm
48
+ prompt_template: |
49
+ You are a QA judge. Score 0.0–1.0 for regulated export exception handling.
50
+ Full score if: compliance review required, no promise of approval, recipient identity/duration/safeguards requested,
51
+ queue=compliance_ops, status=waiting_on_customer.
temp_check.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
"""Print the type and value of each task's ``grader`` entry in openenv.yaml."""
import yaml

# The previous literals carried shell-escaping residue ('"'"'), which Python
# concatenated into names containing literal quote characters, so the wrong
# file and keys were looked up.  Plain literals are used here instead.
# The context manager closes the handle; the explicit encoding keeps
# non-ASCII text in the manifest from depending on the platform default.
with open('openenv.yaml', encoding='utf-8') as manifest_file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the lookup below does not fail on None.
    manifest = yaml.safe_load(manifest_file) or {}

for task in manifest.get('tasks', []):
    grader = task.get('grader')  # None when the task declares no grader
    print(f"{task['id']}: grader type = {type(grader).__name__}, value = {grader}")
tests/test_supportdesk.py CHANGED
@@ -117,15 +117,9 @@ def test_openenv_manifest_graders_are_importable():
117
 
118
  for task in manifest["tasks"]:
119
  grader_block = task["grader"]
120
- if isinstance(grader_block, dict):
121
- grader_path = grader_block["path"]
122
- else:
123
- grader_path = grader_block
124
- module_name, object_name = grader_path.split(":", 1)
125
- module = importlib.import_module(module_name)
126
- grader_cls = getattr(module, object_name)
127
- score = grader_cls().grade(SupportCaseProgress())
128
- assert 0.0 < score < 1.0
129
 
130
 
131
  def test_state_includes_episode_id_after_reset():
 
117
 
118
  for task in manifest["tasks"]:
119
  grader_block = task["grader"]
120
+ assert isinstance(grader_block, dict)
121
+ assert grader_block.get("type") == "llm"
122
+ assert isinstance(grader_block.get("prompt_template"), str)
 
 
 
 
 
 
123
 
124
 
125
  def test_state_includes_episode_id_after_reset():