muskan singh committed on
Commit
ef4ebed
·
1 Parent(s): dd5113f

quick fix

Browse files
Files changed (3) hide show
  1. baseline_scores.json +4 -4
  2. inference.py +24 -18
  3. server/environment.py +10 -22
baseline_scores.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "scores": {
3
- "workflow_A": 0.137,
4
- "workflow_B": 0.563,
5
- "workflow_C": 0.522
6
  },
7
- "average": 0.4073
8
  }
 
1
  {
2
  "scores": {
3
+ "workflow_A": 0.613,
4
+ "workflow_B": 0.65,
5
+ "workflow_C": 0.5
6
  },
7
+ "average": 0.5877
8
  }
inference.py CHANGED
@@ -47,12 +47,11 @@ SYSTEM_PROMPT = """\
47
  You are OrgOS Agent — an enterprise workflow automation agent.
48
  You operate across four SaaS applications: Jira, Zendesk, Salesforce, and Workday.
49
 
50
- Each turn you receive an observation with:
51
- - workflow_goal : the business objective you must achieve
52
- - completed_steps : step IDs already finished (e.g. ["A1", "A2"])
53
- - app_states : current state of each app β€” your primary source of truth
54
- - schema_hints : PARTIAL field renames in effect (e.g. {"jira.priority": "severity"})
55
- Not all drift is revealed β€” probe with get_* if a field is rejected.
56
  - active_rules : current SLA / approval thresholds
57
  - message : feedback from the last action
58
  - current_score : your cumulative score (0.001–0.999)
@@ -75,12 +74,20 @@ Available apps and key operations:
75
 
76
  CRITICAL RULES:
77
  1. Read schema_hints FIRST — if "jira.priority" → "severity", use "severity" not "priority" in args.
78
- If a field is rejected, use get_* or list_* to probe the current schema before retrying.
79
- 2. Inspect app_states to determine what has been done and what still needs action.
80
- 3. Use list_* and get_* operations to discover record IDs β€” never assume them.
81
- 4. Do not repeat a successful action.
82
- 5. If an operation fails, read the message and adapt your field names or args.
83
- 6. Stop when completed_steps covers all workflow steps or done=true.
 
 
 
 
 
 
 
 
84
  """
85
 
86
  WORKFLOW_NAMES = {
@@ -134,19 +141,18 @@ def api_get(path: str) -> dict:
134
  # ------------------------------------------------------------------
135
 
136
  def obs_to_text(obs: dict) -> str:
137
- completed = obs.get("completed_steps", [])
138
- completed_str = ", ".join(completed) if completed else "none"
139
-
140
  lines = [
141
  f"current_score: {obs['current_score']}",
142
  f"step_count: {obs['step_count']}",
143
  f"workflow_id: {obs['workflow_id']}",
144
- f"completed_steps: [{completed_str}]",
145
  "",
146
  "=== WORKFLOW GOAL ===",
147
  obs["workflow_goal"],
148
  "",
149
- "=== SCHEMA HINTS (partial β€” probe with get_* for unknown fields) ===",
 
 
 
150
  json.dumps(obs["schema_hints"], indent=2) if obs["schema_hints"] else " (no drift β€” use canonical names)",
151
  "",
152
  "=== ACTIVE RULES ===",
@@ -155,7 +161,7 @@ def obs_to_text(obs: dict) -> str:
155
  "=== LAST MESSAGE ===",
156
  obs["message"],
157
  "",
158
- "=== APP STATES (use these to determine what still needs to be done) ===",
159
  ]
160
  for app_name, view in obs.get("app_states", {}).items():
161
  lines.append(f" [{app_name.upper()}]")
 
47
  You are OrgOS Agent — an enterprise workflow automation agent.
48
  You operate across four SaaS applications: Jira, Zendesk, Salesforce, and Workday.
49
 
50
+ Each turn you receive a JSON observation with:
51
+ - workflow_goal : the task you must complete
52
+ - pending_steps : remaining steps in the workflow
53
+ - app_states : current state of each app
54
+ - schema_hints : field renames in effect this episode (e.g. {"jira.priority": "severity"})
 
55
  - active_rules : current SLA / approval thresholds
56
  - message : feedback from the last action
57
  - current_score : your cumulative score (0.001–0.999)
 
74
 
75
  CRITICAL RULES:
76
  1. Read schema_hints FIRST — if "jira.priority" → "severity", use "severity" not "priority" in args.
77
+ 2. Complete ALL pending_steps in order.
78
+ 3. Do not repeat a successful action.
79
+ 4. If an operation fails, read the message carefully and adapt.
80
+ 5. Use list_* operations to discover record IDs when needed.
81
+ 6. Stop when pending_steps is empty or done=true.
82
+
83
+ Example actions:
84
+ {"app": "zendesk", "operation": "list_tickets", "args": {"state": "new"}}
85
+ {"app": "zendesk", "operation": "acknowledge_ticket", "args": {"ticket_number": "<ticket_number from list_tickets>"}}
86
+ {"app": "jira", "operation": "create_issue", "args": {"title": "Bug fix for <customer>", "linked_zendesk": "<ticket_number>"}}
87
+ {"app": "salesforce", "operation": "list_accounts", "args": {"health": "red"}}
88
+ {"app": "salesforce", "operation": "get_account", "args": {"account_id": "<account_id from list_accounts>"}}
89
+ {"app": "workday", "operation": "list_employees", "args": {"status": "pending"}}
90
+ {"app": "workday", "operation": "log_sla_event", "args": {"ticket_id": "<ticket_number>", "sla_met": true}}
91
  """
92
 
93
  WORKFLOW_NAMES = {
 
141
  # ------------------------------------------------------------------
142
 
143
  def obs_to_text(obs: dict) -> str:
 
 
 
144
  lines = [
145
  f"current_score: {obs['current_score']}",
146
  f"step_count: {obs['step_count']}",
147
  f"workflow_id: {obs['workflow_id']}",
 
148
  "",
149
  "=== WORKFLOW GOAL ===",
150
  obs["workflow_goal"],
151
  "",
152
+ "=== PENDING STEPS ===",
153
+ "\n".join(f" - {s}" for s in obs["pending_steps"]) or " (all steps complete!)",
154
+ "",
155
+ "=== SCHEMA HINTS (use these field names) ===",
156
  json.dumps(obs["schema_hints"], indent=2) if obs["schema_hints"] else " (no drift β€” use canonical names)",
157
  "",
158
  "=== ACTIVE RULES ===",
 
161
  "=== LAST MESSAGE ===",
162
  obs["message"],
163
  "",
164
+ "=== APP STATES ===",
165
  ]
166
  for app_name, view in obs.get("app_states", {}).items():
167
  lines.append(f" [{app_name.upper()}]")
server/environment.py CHANGED
@@ -87,14 +87,11 @@ class OrgOSEnvironment:
87
  old_score = self._last_score
88
  extra_penalty = 0.0
89
 
90
- # Check max-steps first — applies regardless of action outcome
91
- at_max = self._step_count >= self.MAX_STEPS[self._workflow_id]
92
-
93
  # 1. Validate app exists
94
  if action.app not in self._apps:
95
  return self._build_obs(
96
  reward=-0.05,
97
- done=at_max,
98
  message=f"Unknown app '{action.app}'. Valid apps: {list(self._apps)}",
99
  )
100
 
@@ -107,7 +104,7 @@ class OrgOSEnvironment:
107
  extra_penalty = rule_penalty
108
  return self._build_obs(
109
  reward=extra_penalty,
110
- done=at_max,
111
  message=f"Rule violation: {reason}",
112
  )
113
 
@@ -119,7 +116,7 @@ class OrgOSEnvironment:
119
  self._efficiency -= 0.02
120
  return self._build_obs(
121
  reward=-0.20,
122
- done=at_max,
123
  message=(
124
  f"Stale schema: field '{result['schema_error']}' is no longer valid. "
125
  "Check schema_hints for the current field name. "
@@ -131,26 +128,26 @@ class OrgOSEnvironment:
131
  self._efficiency -= 0.02 # penalize failed/no-op actions
132
  return self._build_obs(
133
  reward=-0.01,
134
- done=at_max,
135
  message=result.get("message", "Operation failed"),
136
  )
137
 
138
- # Earn compliance + efficiency for every successful action
139
- self._rule_score = min(1.0, self._rule_score + 0.10)
140
- self._efficiency = min(1.0, self._efficiency + 0.10)
141
-
142
  # Schema adaptation bonus (agent used correct drifted field name)
143
  if result.get("schema_adapted"):
144
  self._schema_score = min(1.0, self._schema_score + 0.10)
145
  self._policy_score = min(1.0, self._policy_score + 0.05)
146
 
 
 
 
 
147
  # 5. Re-evaluate workflow completion
148
  self._wf_score = self._workflow.evaluate(self._apps)
149
 
150
  # 6. SLA check (only if a ticket was touched)
151
  sla_ok, sla_pen = self._rules.check_sla(
152
  result.get("ticket", {}),
153
- self._step_count * 2.5, # approximate 2.5 min per step
154
  )
155
  if not sla_ok:
156
  extra_penalty += sla_pen
@@ -227,17 +224,8 @@ class OrgOSEnvironment:
227
 
228
  # Workflow progress
229
  completed_steps = self._workflow.get_completed()
 
230
  workflow_goal = self._workflow.get_goal()
231
- total_steps = len(self._workflow._steps)
232
- steps_done = len(completed_steps)
233
- # Expose only a progress count — agent must infer what's next from app states
234
- if steps_done == total_steps:
235
- pending_steps = []
236
- else:
237
- pending_steps = [
238
- f"{steps_done}/{total_steps} steps completed β€” "
239
- "inspect app states and schema_hints to determine your next action."
240
- ]
241
 
242
  # Reward breakdown snapshot
243
  breakdown = RewardBreakdown(
 
87
  old_score = self._last_score
88
  extra_penalty = 0.0
89
 
 
 
 
90
  # 1. Validate app exists
91
  if action.app not in self._apps:
92
  return self._build_obs(
93
  reward=-0.05,
94
+ done=False,
95
  message=f"Unknown app '{action.app}'. Valid apps: {list(self._apps)}",
96
  )
97
 
 
104
  extra_penalty = rule_penalty
105
  return self._build_obs(
106
  reward=extra_penalty,
107
+ done=False,
108
  message=f"Rule violation: {reason}",
109
  )
110
 
 
116
  self._efficiency -= 0.02
117
  return self._build_obs(
118
  reward=-0.20,
119
+ done=False,
120
  message=(
121
  f"Stale schema: field '{result['schema_error']}' is no longer valid. "
122
  "Check schema_hints for the current field name. "
 
128
  self._efficiency -= 0.02 # penalize failed/no-op actions
129
  return self._build_obs(
130
  reward=-0.01,
131
+ done=False,
132
  message=result.get("message", "Operation failed"),
133
  )
134
 
 
 
 
 
135
  # Schema adaptation bonus (agent used correct drifted field name)
136
  if result.get("schema_adapted"):
137
  self._schema_score = min(1.0, self._schema_score + 0.10)
138
  self._policy_score = min(1.0, self._policy_score + 0.05)
139
 
140
+ # Earn compliance + efficiency for every successful action
141
+ self._rule_score = min(1.0, self._rule_score + 0.10)
142
+ self._efficiency = min(1.0, self._efficiency + 0.10)
143
+
144
  # 5. Re-evaluate workflow completion
145
  self._wf_score = self._workflow.evaluate(self._apps)
146
 
147
  # 6. SLA check (only if a ticket was touched)
148
  sla_ok, sla_pen = self._rules.check_sla(
149
  result.get("ticket", {}),
150
+ self._step_count * 10, # 10 min per step — P1 breaches at step 24 (step 12 under policy drift)
151
  )
152
  if not sla_ok:
153
  extra_penalty += sla_pen
 
224
 
225
  # Workflow progress
226
  completed_steps = self._workflow.get_completed()
227
+ pending_steps = self._workflow.get_pending()
228
  workflow_goal = self._workflow.get_goal()
 
 
 
 
 
 
 
 
 
 
229
 
230
  # Reward breakdown snapshot
231
  breakdown = RewardBreakdown(