Spaces:
Running
Running
muskan singh committed on
Commit ·
ef4ebed
1
Parent(s): dd5113f
quick fix
Browse files- baseline_scores.json +4 -4
- inference.py +24 -18
- server/environment.py +10 -22
baseline_scores.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"scores": {
|
| 3 |
-
"workflow_A": 0.
|
| 4 |
-
"workflow_B": 0.
|
| 5 |
-
"workflow_C": 0.
|
| 6 |
},
|
| 7 |
-
"average": 0.
|
| 8 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"scores": {
|
| 3 |
+
"workflow_A": 0.613,
|
| 4 |
+
"workflow_B": 0.65,
|
| 5 |
+
"workflow_C": 0.5
|
| 6 |
},
|
| 7 |
+
"average": 0.5877
|
| 8 |
}
|
inference.py
CHANGED
|
@@ -47,12 +47,11 @@ SYSTEM_PROMPT = """\
|
|
| 47 |
You are OrgOS Agent — an enterprise workflow automation agent.
|
| 48 |
You operate across four SaaS applications: Jira, Zendesk, Salesforce, and Workday.
|
| 49 |
|
| 50 |
-
Each turn you receive
|
| 51 |
-
- workflow_goal : the
|
| 52 |
-
-
|
| 53 |
-
- app_states : current state of each app
|
| 54 |
-
- schema_hints :
|
| 55 |
-
Not all drift is revealed — probe with get_* if a field is rejected.
|
| 56 |
- active_rules : current SLA / approval thresholds
|
| 57 |
- message : feedback from the last action
|
| 58 |
- current_score : your cumulative score (0.001–0.999)
|
|
@@ -75,12 +74,20 @@ Available apps and key operations:
|
|
| 75 |
|
| 76 |
CRITICAL RULES:
|
| 77 |
1. Read schema_hints FIRST — if "jira.priority" → "severity", use "severity" not "priority" in args.
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
"""
|
| 85 |
|
| 86 |
WORKFLOW_NAMES = {
|
|
@@ -134,19 +141,18 @@ def api_get(path: str) -> dict:
|
|
| 134 |
# ------------------------------------------------------------------
|
| 135 |
|
| 136 |
def obs_to_text(obs: dict) -> str:
|
| 137 |
-
completed = obs.get("completed_steps", [])
|
| 138 |
-
completed_str = ", ".join(completed) if completed else "none"
|
| 139 |
-
|
| 140 |
lines = [
|
| 141 |
f"current_score: {obs['current_score']}",
|
| 142 |
f"step_count: {obs['step_count']}",
|
| 143 |
f"workflow_id: {obs['workflow_id']}",
|
| 144 |
-
f"completed_steps: [{completed_str}]",
|
| 145 |
"",
|
| 146 |
"=== WORKFLOW GOAL ===",
|
| 147 |
obs["workflow_goal"],
|
| 148 |
"",
|
| 149 |
-
"===
|
|
|
|
|
|
|
|
|
|
| 150 |
json.dumps(obs["schema_hints"], indent=2) if obs["schema_hints"] else " (no drift β use canonical names)",
|
| 151 |
"",
|
| 152 |
"=== ACTIVE RULES ===",
|
|
@@ -155,7 +161,7 @@ def obs_to_text(obs: dict) -> str:
|
|
| 155 |
"=== LAST MESSAGE ===",
|
| 156 |
obs["message"],
|
| 157 |
"",
|
| 158 |
-
"=== APP STATES
|
| 159 |
]
|
| 160 |
for app_name, view in obs.get("app_states", {}).items():
|
| 161 |
lines.append(f" [{app_name.upper()}]")
|
|
|
|
| 47 |
You are OrgOS Agent — an enterprise workflow automation agent.
|
| 48 |
You operate across four SaaS applications: Jira, Zendesk, Salesforce, and Workday.
|
| 49 |
|
| 50 |
+
Each turn you receive a JSON observation with:
|
| 51 |
+
- workflow_goal : the task you must complete
|
| 52 |
+
- pending_steps : remaining steps in the workflow
|
| 53 |
+
- app_states : current state of each app
|
| 54 |
+
- schema_hints : field renames in effect this episode (e.g. {"jira.priority": "severity"})
|
|
|
|
| 55 |
- active_rules : current SLA / approval thresholds
|
| 56 |
- message : feedback from the last action
|
| 57 |
- current_score : your cumulative score (0.001–0.999)
|
|
|
|
| 74 |
|
| 75 |
CRITICAL RULES:
|
| 76 |
1. Read schema_hints FIRST — if "jira.priority" → "severity", use "severity" not "priority" in args.
|
| 77 |
+
2. Complete ALL pending_steps in order.
|
| 78 |
+
3. Do not repeat a successful action.
|
| 79 |
+
4. If an operation fails, read the message carefully and adapt.
|
| 80 |
+
5. Use list_* operations to discover record IDs when needed.
|
| 81 |
+
6. Stop when pending_steps is empty or done=true.
|
| 82 |
+
|
| 83 |
+
Example actions:
|
| 84 |
+
{"app": "zendesk", "operation": "list_tickets", "args": {"state": "new"}}
|
| 85 |
+
{"app": "zendesk", "operation": "acknowledge_ticket", "args": {"ticket_number": "<ticket_number from list_tickets>"}}
|
| 86 |
+
{"app": "jira", "operation": "create_issue", "args": {"title": "Bug fix for <customer>", "linked_zendesk": "<ticket_number>"}}
|
| 87 |
+
{"app": "salesforce", "operation": "list_accounts", "args": {"health": "red"}}
|
| 88 |
+
{"app": "salesforce", "operation": "get_account", "args": {"account_id": "<account_id from list_accounts>"}}
|
| 89 |
+
{"app": "workday", "operation": "list_employees", "args": {"status": "pending"}}
|
| 90 |
+
{"app": "workday", "operation": "log_sla_event", "args": {"ticket_id": "<ticket_number>", "sla_met": true}}
|
| 91 |
"""
|
| 92 |
|
| 93 |
WORKFLOW_NAMES = {
|
|
|
|
| 141 |
# ------------------------------------------------------------------
|
| 142 |
|
| 143 |
def obs_to_text(obs: dict) -> str:
|
|
|
|
|
|
|
|
|
|
| 144 |
lines = [
|
| 145 |
f"current_score: {obs['current_score']}",
|
| 146 |
f"step_count: {obs['step_count']}",
|
| 147 |
f"workflow_id: {obs['workflow_id']}",
|
|
|
|
| 148 |
"",
|
| 149 |
"=== WORKFLOW GOAL ===",
|
| 150 |
obs["workflow_goal"],
|
| 151 |
"",
|
| 152 |
+
"=== PENDING STEPS ===",
|
| 153 |
+
"\n".join(f" - {s}" for s in obs["pending_steps"]) or " (all steps complete!)",
|
| 154 |
+
"",
|
| 155 |
+
"=== SCHEMA HINTS (use these field names) ===",
|
| 156 |
json.dumps(obs["schema_hints"], indent=2) if obs["schema_hints"] else " (no drift β use canonical names)",
|
| 157 |
"",
|
| 158 |
"=== ACTIVE RULES ===",
|
|
|
|
| 161 |
"=== LAST MESSAGE ===",
|
| 162 |
obs["message"],
|
| 163 |
"",
|
| 164 |
+
"=== APP STATES ===",
|
| 165 |
]
|
| 166 |
for app_name, view in obs.get("app_states", {}).items():
|
| 167 |
lines.append(f" [{app_name.upper()}]")
|
server/environment.py
CHANGED
|
@@ -87,14 +87,11 @@ class OrgOSEnvironment:
|
|
| 87 |
old_score = self._last_score
|
| 88 |
extra_penalty = 0.0
|
| 89 |
|
| 90 |
-
# Check max-steps first β applies regardless of action outcome
|
| 91 |
-
at_max = self._step_count >= self.MAX_STEPS[self._workflow_id]
|
| 92 |
-
|
| 93 |
# 1. Validate app exists
|
| 94 |
if action.app not in self._apps:
|
| 95 |
return self._build_obs(
|
| 96 |
reward=-0.05,
|
| 97 |
-
done=
|
| 98 |
message=f"Unknown app '{action.app}'. Valid apps: {list(self._apps)}",
|
| 99 |
)
|
| 100 |
|
|
@@ -107,7 +104,7 @@ class OrgOSEnvironment:
|
|
| 107 |
extra_penalty = rule_penalty
|
| 108 |
return self._build_obs(
|
| 109 |
reward=extra_penalty,
|
| 110 |
-
done=
|
| 111 |
message=f"Rule violation: {reason}",
|
| 112 |
)
|
| 113 |
|
|
@@ -119,7 +116,7 @@ class OrgOSEnvironment:
|
|
| 119 |
self._efficiency -= 0.02
|
| 120 |
return self._build_obs(
|
| 121 |
reward=-0.20,
|
| 122 |
-
done=
|
| 123 |
message=(
|
| 124 |
f"Stale schema: field '{result['schema_error']}' is no longer valid. "
|
| 125 |
"Check schema_hints for the current field name. "
|
|
@@ -131,26 +128,26 @@ class OrgOSEnvironment:
|
|
| 131 |
self._efficiency -= 0.02 # penalize failed/no-op actions
|
| 132 |
return self._build_obs(
|
| 133 |
reward=-0.01,
|
| 134 |
-
done=
|
| 135 |
message=result.get("message", "Operation failed"),
|
| 136 |
)
|
| 137 |
|
| 138 |
-
# Earn compliance + efficiency for every successful action
|
| 139 |
-
self._rule_score = min(1.0, self._rule_score + 0.10)
|
| 140 |
-
self._efficiency = min(1.0, self._efficiency + 0.10)
|
| 141 |
-
|
| 142 |
# Schema adaptation bonus (agent used correct drifted field name)
|
| 143 |
if result.get("schema_adapted"):
|
| 144 |
self._schema_score = min(1.0, self._schema_score + 0.10)
|
| 145 |
self._policy_score = min(1.0, self._policy_score + 0.05)
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
# 5. Re-evaluate workflow completion
|
| 148 |
self._wf_score = self._workflow.evaluate(self._apps)
|
| 149 |
|
| 150 |
# 6. SLA check (only if a ticket was touched)
|
| 151 |
sla_ok, sla_pen = self._rules.check_sla(
|
| 152 |
result.get("ticket", {}),
|
| 153 |
-
self._step_count *
|
| 154 |
)
|
| 155 |
if not sla_ok:
|
| 156 |
extra_penalty += sla_pen
|
|
@@ -227,17 +224,8 @@ class OrgOSEnvironment:
|
|
| 227 |
|
| 228 |
# Workflow progress
|
| 229 |
completed_steps = self._workflow.get_completed()
|
|
|
|
| 230 |
workflow_goal = self._workflow.get_goal()
|
| 231 |
-
total_steps = len(self._workflow._steps)
|
| 232 |
-
steps_done = len(completed_steps)
|
| 233 |
-
# Expose only a progress count β agent must infer what's next from app states
|
| 234 |
-
if steps_done == total_steps:
|
| 235 |
-
pending_steps = []
|
| 236 |
-
else:
|
| 237 |
-
pending_steps = [
|
| 238 |
-
f"{steps_done}/{total_steps} steps completed β "
|
| 239 |
-
"inspect app states and schema_hints to determine your next action."
|
| 240 |
-
]
|
| 241 |
|
| 242 |
# Reward breakdown snapshot
|
| 243 |
breakdown = RewardBreakdown(
|
|
|
|
| 87 |
old_score = self._last_score
|
| 88 |
extra_penalty = 0.0
|
| 89 |
|
|
|
|
|
|
|
|
|
|
| 90 |
# 1. Validate app exists
|
| 91 |
if action.app not in self._apps:
|
| 92 |
return self._build_obs(
|
| 93 |
reward=-0.05,
|
| 94 |
+
done=False,
|
| 95 |
message=f"Unknown app '{action.app}'. Valid apps: {list(self._apps)}",
|
| 96 |
)
|
| 97 |
|
|
|
|
| 104 |
extra_penalty = rule_penalty
|
| 105 |
return self._build_obs(
|
| 106 |
reward=extra_penalty,
|
| 107 |
+
done=False,
|
| 108 |
message=f"Rule violation: {reason}",
|
| 109 |
)
|
| 110 |
|
|
|
|
| 116 |
self._efficiency -= 0.02
|
| 117 |
return self._build_obs(
|
| 118 |
reward=-0.20,
|
| 119 |
+
done=False,
|
| 120 |
message=(
|
| 121 |
f"Stale schema: field '{result['schema_error']}' is no longer valid. "
|
| 122 |
"Check schema_hints for the current field name. "
|
|
|
|
| 128 |
self._efficiency -= 0.02 # penalize failed/no-op actions
|
| 129 |
return self._build_obs(
|
| 130 |
reward=-0.01,
|
| 131 |
+
done=False,
|
| 132 |
message=result.get("message", "Operation failed"),
|
| 133 |
)
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
# Schema adaptation bonus (agent used correct drifted field name)
|
| 136 |
if result.get("schema_adapted"):
|
| 137 |
self._schema_score = min(1.0, self._schema_score + 0.10)
|
| 138 |
self._policy_score = min(1.0, self._policy_score + 0.05)
|
| 139 |
|
| 140 |
+
# Earn compliance + efficiency for every successful action
|
| 141 |
+
self._rule_score = min(1.0, self._rule_score + 0.10)
|
| 142 |
+
self._efficiency = min(1.0, self._efficiency + 0.10)
|
| 143 |
+
|
| 144 |
# 5. Re-evaluate workflow completion
|
| 145 |
self._wf_score = self._workflow.evaluate(self._apps)
|
| 146 |
|
| 147 |
# 6. SLA check (only if a ticket was touched)
|
| 148 |
sla_ok, sla_pen = self._rules.check_sla(
|
| 149 |
result.get("ticket", {}),
|
| 150 |
+
self._step_count * 10, # 10 min per step — P1 breaches at step 24 (step 12 under policy drift)
|
| 151 |
)
|
| 152 |
if not sla_ok:
|
| 153 |
extra_penalty += sla_pen
|
|
|
|
| 224 |
|
| 225 |
# Workflow progress
|
| 226 |
completed_steps = self._workflow.get_completed()
|
| 227 |
+
pending_steps = self._workflow.get_pending()
|
| 228 |
workflow_goal = self._workflow.get_goal()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
# Reward breakdown snapshot
|
| 231 |
breakdown = RewardBreakdown(
|