Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- check_graders.py +6 -0
- check_yaml.py +8 -0
- openenv.yaml +28 -4
- temp_check.py +6 -0
- tests/test_supportdesk.py +3 -9
check_graders.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
|
| 3 |
+
# Print every task's grader entry from the OpenEnv manifest together with the
# Python type it parsed to, so the shape of each entry can be checked by eye.
#
# The file name and dictionary keys are plain strings here; the previous
# shell-escaped quoting ('"'"'...'"'"') had become part of the literals, so
# the script looked for a file and keys whose names included quote characters.
# Decode explicitly as UTF-8 rather than relying on the platform default.
with open("openenv.yaml", encoding="utf-8") as f:
    d = yaml.safe_load(f)

# An empty YAML document loads as None; treat that as "no tasks".
for t in (d or {}).get("tasks", []):
    g = t.get("grader")
    print(f"{t['id']}: grader type = {type(g).__name__}, value = {g}")
|
check_yaml.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
|
| 3 |
+
# Print every task's grader entry from the OpenEnv manifest together with the
# Python type it parsed to, so the shape of each entry can be checked by eye.
#
# Decode explicitly as UTF-8: the manifest text may contain non-ASCII
# characters, which the platform default encoding can mis-decode.
with open('openenv.yaml', encoding='utf-8') as f:
    d = yaml.safe_load(f)

# An empty YAML document loads as None; treat that as "no tasks".
for t in (d or {}).get("tasks", []):
    g = t.get("grader")
    print(f"{t['id']}: grader type = {type(g).__name__}, value = {g}")
|
openenv.yaml
CHANGED
|
@@ -15,13 +15,37 @@ endpoints:
|
|
| 15 |
tasks:
|
| 16 |
- id: billing_refund_easy
|
| 17 |
difficulty: easy
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
- id: account_takeover_medium
|
| 20 |
difficulty: medium
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
- id: api_incident_hard
|
| 23 |
difficulty: hard
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
- id: regulated_export_exception_hard
|
| 26 |
difficulty: hard
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
tasks:
|
| 16 |
- id: billing_refund_easy
|
| 17 |
difficulty: easy
|
| 18 |
+
max_steps: 6
|
| 19 |
+
grader:
|
| 20 |
+
type: llm
|
| 21 |
+
prompt_template: |
|
| 22 |
+
You are a QA judge. Score the agent's handling of a duplicate-charge refund ticket from 0.0 to 1.0.
|
| 23 |
+
Full score if: queue=billing_ops, priority=high, issue_type=duplicate_charge, status=resolved,
|
| 24 |
+
reply confirms refund with reference/timeline, and internal note mentions duplicate charge/refund.
|
| 25 |
- id: account_takeover_medium
|
| 26 |
difficulty: medium
|
| 27 |
+
max_steps: 7
|
| 28 |
+
grader:
|
| 29 |
+
type: llm
|
| 30 |
+
prompt_template: |
|
| 31 |
+
You are a QA judge. Score 0.0–1.0 for account-compromise recovery.
|
| 32 |
+
Full score if: account locked, sessions invalidated, ownership verification requested, queue=trust_and_safety,
|
| 33 |
+
status escalated/waiting_on_customer, and reply is security-appropriate (no promises).
|
| 34 |
- id: api_incident_hard
|
| 35 |
difficulty: hard
|
| 36 |
+
max_steps: 8
|
| 37 |
+
grader:
|
| 38 |
+
type: llm
|
| 39 |
+
prompt_template: |
|
| 40 |
+
You are a QA judge. Score 0.0–1.0 for production API incident triage.
|
| 41 |
+
Full score if: request IDs/time window collected, status page mentioned, escalation to platform_engineering,
|
| 42 |
+
queue=platform_engineering, and reply acknowledges incident.
|
| 43 |
- id: regulated_export_exception_hard
|
| 44 |
difficulty: hard
|
| 45 |
+
max_steps: 8
|
| 46 |
+
grader:
|
| 47 |
+
type: llm
|
| 48 |
+
prompt_template: |
|
| 49 |
+
You are a QA judge. Score 0.0–1.0 for regulated export exception handling.
|
| 50 |
+
Full score if: compliance review required, no promise of approval, recipient identity/duration/safeguards requested,
|
| 51 |
+
queue=compliance_ops, status=waiting_on_customer.
|
temp_check.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
|
| 3 |
+
# Print every task's grader entry from the OpenEnv manifest together with the
# Python type it parsed to, so the shape of each entry can be checked by eye.
#
# The file name and dictionary keys are plain strings here; the previous
# shell-escaped quoting ('"'"'...'"'"') had become part of the literals, so
# the script looked for a file and keys whose names included quote characters.
# Decode explicitly as UTF-8 rather than relying on the platform default.
with open("openenv.yaml", encoding="utf-8") as f:
    d = yaml.safe_load(f)

# An empty YAML document loads as None; treat that as "no tasks".
for t in (d or {}).get("tasks", []):
    g = t.get("grader")
    print(f"{t['id']}: grader type = {type(g).__name__}, value = {g}")
|
tests/test_supportdesk.py
CHANGED
|
@@ -117,15 +117,9 @@ def test_openenv_manifest_graders_are_importable():
|
|
| 117 |
|
| 118 |
for task in manifest["tasks"]:
|
| 119 |
grader_block = task["grader"]
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
grader_path = grader_block
|
| 124 |
-
module_name, object_name = grader_path.split(":", 1)
|
| 125 |
-
module = importlib.import_module(module_name)
|
| 126 |
-
grader_cls = getattr(module, object_name)
|
| 127 |
-
score = grader_cls().grade(SupportCaseProgress())
|
| 128 |
-
assert 0.0 < score < 1.0
|
| 129 |
|
| 130 |
|
| 131 |
def test_state_includes_episode_id_after_reset():
|
|
|
|
| 117 |
|
| 118 |
for task in manifest["tasks"]:
|
| 119 |
grader_block = task["grader"]
|
| 120 |
+
assert isinstance(grader_block, dict)
|
| 121 |
+
assert grader_block.get("type") == "llm"
|
| 122 |
+
assert isinstance(grader_block.get("prompt_template"), str)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
|
| 125 |
def test_state_includes_episode_id_after_reset():
|