Spaces:

modelbuilderhq
/

HyperBrickCaseOps

Sleeping

App Files Files Community

modelbuilderhq committed 30 days ago

Commit

551c5bc

verified ·

1 Parent(s): 8c82a6f

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

check_graders.py +6 -0
check_yaml.py +8 -0
openenv.yaml +28 -4
temp_check.py +6 -0
tests/test_supportdesk.py +3 -9

check_graders.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import yaml
+d = yaml.safe_load(open('openenv.yaml'))
+for t in d.get('tasks', []):
+    g = t.get('grader')
+    print(f"{t['id']}: grader type = {type(g).__name__}, value = {g}")

check_yaml.py ADDED Viewed

	@@ -0,0 +1,8 @@

+import yaml
+with open('openenv.yaml') as f:
+    d = yaml.safe_load(f)
+for t in d.get("tasks", []):
+    g = t.get("grader")
+    print(f"{t['id']}: grader type = {type(g).__name__}, value = {g}")

openenv.yaml CHANGED Viewed

@@ -15,13 +15,37 @@ endpoints:
 tasks:
   - id: billing_refund_easy
     difficulty: easy
-    grader: graders:BillingRefundEasyGrader
   - id: account_takeover_medium
     difficulty: medium
-    grader: graders:AccountTakeoverMediumGrader
   - id: api_incident_hard
     difficulty: hard
-    grader: graders:ApiIncidentHardGrader
   - id: regulated_export_exception_hard
     difficulty: hard
-    grader: graders:RegulatedExportExceptionHardGrader

 tasks:
   - id: billing_refund_easy
     difficulty: easy
+    max_steps: 6
+    grader:
+      type: llm
+      prompt_template: |
+        You are a QA judge. Score the agent's handling of a duplicate-charge refund ticket from 0.0 to 1.0.
+        Full score if: queue=billing_ops, priority=high, issue_type=duplicate_charge, status=resolved,
+        reply confirms refund with reference/timeline, and internal note mentions duplicate charge/refund.
   - id: account_takeover_medium
     difficulty: medium
+    max_steps: 7
+    grader:
+      type: llm
+      prompt_template: |
+        You are a QA judge. Score 0.0–1.0 for account-compromise recovery.
+        Full score if: account locked, sessions invalidated, ownership verification requested, queue=trust_and_safety,
+        status escalated/waiting_on_customer, and reply is security-appropriate (no promises).
   - id: api_incident_hard
     difficulty: hard
+    max_steps: 8
+    grader:
+      type: llm
+      prompt_template: |
+        You are a QA judge. Score 0.0–1.0 for production API incident triage.
+        Full score if: request IDs/time window collected, status page mentioned, escalation to platform_engineering,
+        queue=platform_engineering, and reply acknowledges incident.
   - id: regulated_export_exception_hard
     difficulty: hard
+    max_steps: 8
+    grader:
+      type: llm
+      prompt_template: |
+        You are a QA judge. Score 0.0–1.0 for regulated export exception handling.
+        Full score if: compliance review required, no promise of approval, recipient identity/duration/safeguards requested,
+        queue=compliance_ops, status=waiting_on_customer.

temp_check.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import yaml
+d = yaml.safe_load(open('openenv.yaml'))
+for t in d.get('tasks', []):
+    g = t.get('grader')
+    print(f"{t['id']}: grader type = {type(g).__name__}, value = {g}")

tests/test_supportdesk.py CHANGED Viewed

@@ -117,15 +117,9 @@ def test_openenv_manifest_graders_are_importable():
     for task in manifest["tasks"]:
         grader_block = task["grader"]
-        if isinstance(grader_block, dict):
-            grader_path = grader_block["path"]
-        else:
-            grader_path = grader_block
-        module_name, object_name = grader_path.split(":", 1)
-        module = importlib.import_module(module_name)
-        grader_cls = getattr(module, object_name)
-        score = grader_cls().grade(SupportCaseProgress())
-        assert 0.0 < score < 1.0
 def test_state_includes_episode_id_after_reset():

     for task in manifest["tasks"]:
         grader_block = task["grader"]
+        assert isinstance(grader_block, dict)
+        assert grader_block.get("type") == "llm"
+        assert isinstance(grader_block.get("prompt_template"), str)
 def test_state_includes_episode_id_after_reset():