Spaces:

srishtichugh
/

orgOS

Running

App Files Files Community

muskan singh committed 13 days ago

Commit

ef4ebed

1 Parent(s): dd5113f

quick fix

Browse files

Files changed (3) hide show

baseline_scores.json +4 -4
inference.py +24 -18
server/environment.py +10 -22

baseline_scores.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
   "scores": {
-    "workflow_A": 0.137,
-    "workflow_B": 0.563,
-    "workflow_C": 0.522
   },
-  "average": 0.4073
 }

 {
   "scores": {
+    "workflow_A": 0.613,
+    "workflow_B": 0.65,
+    "workflow_C": 0.5
   },
+  "average": 0.5877
 }

inference.py CHANGED Viewed

@@ -47,12 +47,11 @@ SYSTEM_PROMPT = """\
 You are OrgOS Agent — an enterprise workflow automation agent.
 You operate across four SaaS applications: Jira, Zendesk, Salesforce, and Workday.
-Each turn you receive an observation with:
-  - workflow_goal    : the business objective you must achieve
-  - completed_steps  : step IDs already finished (e.g. ["A1", "A2"])
-  - app_states       : current state of each app — your primary source of truth
-  - schema_hints     : PARTIAL field renames in effect (e.g. {"jira.priority": "severity"})
-                       Not all drift is revealed — probe with get_* if a field is rejected.
   - active_rules     : current SLA / approval thresholds
   - message          : feedback from the last action
   - current_score    : your cumulative score (0.001–0.999)
@@ -75,12 +74,20 @@ Available apps and key operations:
 CRITICAL RULES:
 1. Read schema_hints FIRST — if "jira.priority" → "severity", use "severity" not "priority" in args.
-   If a field is rejected, use get_* or list_* to probe the current schema before retrying.
-2. Inspect app_states to determine what has been done and what still needs action.
-3. Use list_* and get_* operations to discover record IDs — never assume them.
-4. Do not repeat a successful action.
-5. If an operation fails, read the message and adapt your field names or args.
-6. Stop when completed_steps covers all workflow steps or done=true.
 """
 WORKFLOW_NAMES = {
@@ -134,19 +141,18 @@ def api_get(path: str) -> dict:
 # ------------------------------------------------------------------
 def obs_to_text(obs: dict) -> str:
-    completed = obs.get("completed_steps", [])
-    completed_str = ", ".join(completed) if completed else "none"
     lines = [
         f"current_score:  {obs['current_score']}",
         f"step_count:     {obs['step_count']}",
         f"workflow_id:    {obs['workflow_id']}",
-        f"completed_steps: [{completed_str}]",
         "",
         "=== WORKFLOW GOAL ===",
         obs["workflow_goal"],
         "",
-        "=== SCHEMA HINTS (partial — probe with get_* for unknown fields) ===",
         json.dumps(obs["schema_hints"], indent=2) if obs["schema_hints"] else "  (no drift — use canonical names)",
         "",
         "=== ACTIVE RULES ===",
@@ -155,7 +161,7 @@ def obs_to_text(obs: dict) -> str:
         "=== LAST MESSAGE ===",
         obs["message"],
         "",
-        "=== APP STATES (use these to determine what still needs to be done) ===",
     ]
     for app_name, view in obs.get("app_states", {}).items():
         lines.append(f"  [{app_name.upper()}]")

 You are OrgOS Agent — an enterprise workflow automation agent.
 You operate across four SaaS applications: Jira, Zendesk, Salesforce, and Workday.
+Each turn you receive a JSON observation with:
+  - workflow_goal    : the task you must complete
+  - pending_steps    : remaining steps in the workflow
+  - app_states       : current state of each app
+  - schema_hints     : field renames in effect this episode (e.g. {"jira.priority": "severity"})
   - active_rules     : current SLA / approval thresholds
   - message          : feedback from the last action
   - current_score    : your cumulative score (0.001–0.999)
 CRITICAL RULES:
 1. Read schema_hints FIRST — if "jira.priority" → "severity", use "severity" not "priority" in args.
+2. Complete ALL pending_steps in order.
+3. Do not repeat a successful action.
+4. If an operation fails, read the message carefully and adapt.
+5. Use list_* operations to discover record IDs when needed.
+6. Stop when pending_steps is empty or done=true.
+Example actions:
+  {"app": "zendesk", "operation": "list_tickets", "args": {"state": "new"}}
+  {"app": "zendesk", "operation": "acknowledge_ticket", "args": {"ticket_number": "<ticket_number from list_tickets>"}}
+  {"app": "jira", "operation": "create_issue", "args": {"title": "Bug fix for <customer>", "linked_zendesk": "<ticket_number>"}}
+  {"app": "salesforce", "operation": "list_accounts", "args": {"health": "red"}}
+  {"app": "salesforce", "operation": "get_account", "args": {"account_id": "<account_id from list_accounts>"}}
+  {"app": "workday", "operation": "list_employees", "args": {"status": "pending"}}
+  {"app": "workday", "operation": "log_sla_event", "args": {"ticket_id": "<ticket_number>", "sla_met": true}}
 """
 WORKFLOW_NAMES = {
 # ------------------------------------------------------------------
 def obs_to_text(obs: dict) -> str:
     lines = [
         f"current_score:  {obs['current_score']}",
         f"step_count:     {obs['step_count']}",
         f"workflow_id:    {obs['workflow_id']}",
         "",
         "=== WORKFLOW GOAL ===",
         obs["workflow_goal"],
         "",
+        "=== PENDING STEPS ===",
+        "\n".join(f"  - {s}" for s in obs["pending_steps"]) or "  (all steps complete!)",
+        "",
+        "=== SCHEMA HINTS (use these field names) ===",
         json.dumps(obs["schema_hints"], indent=2) if obs["schema_hints"] else "  (no drift — use canonical names)",
         "",
         "=== ACTIVE RULES ===",
         "=== LAST MESSAGE ===",
         obs["message"],
         "",
+        "=== APP STATES ===",
     ]
     for app_name, view in obs.get("app_states", {}).items():
         lines.append(f"  [{app_name.upper()}]")

server/environment.py CHANGED Viewed

@@ -87,14 +87,11 @@ class OrgOSEnvironment:
         old_score    = self._last_score
         extra_penalty = 0.0
-        # Check max-steps first — applies regardless of action outcome
-        at_max = self._step_count >= self.MAX_STEPS[self._workflow_id]
         # 1. Validate app exists
         if action.app not in self._apps:
             return self._build_obs(
                 reward=-0.05,
-                done=at_max,
                 message=f"Unknown app '{action.app}'. Valid apps: {list(self._apps)}",
             )
@@ -107,7 +104,7 @@ class OrgOSEnvironment:
             extra_penalty    = rule_penalty
             return self._build_obs(
                 reward=extra_penalty,
-                done=at_max,
                 message=f"Rule violation: {reason}",
             )
@@ -119,7 +116,7 @@ class OrgOSEnvironment:
             self._efficiency -= 0.02
             return self._build_obs(
                 reward=-0.20,
-                done=at_max,
                 message=(
                     f"Stale schema: field '{result['schema_error']}' is no longer valid. "
                     "Check schema_hints for the current field name. "
@@ -131,26 +128,26 @@ class OrgOSEnvironment:
             self._efficiency -= 0.02   # penalize failed/no-op actions
             return self._build_obs(
                 reward=-0.01,
-                done=at_max,
                 message=result.get("message", "Operation failed"),
             )
-        # Earn compliance + efficiency for every successful action
-        self._rule_score = min(1.0, self._rule_score + 0.10)
-        self._efficiency = min(1.0, self._efficiency + 0.10)
         # Schema adaptation bonus (agent used correct drifted field name)
         if result.get("schema_adapted"):
             self._schema_score = min(1.0, self._schema_score + 0.10)
             self._policy_score = min(1.0, self._policy_score + 0.05)
         # 5. Re-evaluate workflow completion
         self._wf_score = self._workflow.evaluate(self._apps)
         # 6. SLA check (only if a ticket was touched)
         sla_ok, sla_pen = self._rules.check_sla(
             result.get("ticket", {}),
-            self._step_count * 2.5,   # approximate 2.5 min per step
         )
         if not sla_ok:
             extra_penalty   += sla_pen
@@ -227,17 +224,8 @@ class OrgOSEnvironment:
         # Workflow progress
         completed_steps = self._workflow.get_completed()
         workflow_goal   = self._workflow.get_goal()
-        total_steps     = len(self._workflow._steps)
-        steps_done      = len(completed_steps)
-        # Expose only a progress count — agent must infer what's next from app states
-        if steps_done == total_steps:
-            pending_steps = []
-        else:
-            pending_steps = [
-                f"{steps_done}/{total_steps} steps completed — "
-                "inspect app states and schema_hints to determine your next action."
-            ]
         # Reward breakdown snapshot
         breakdown = RewardBreakdown(

         old_score    = self._last_score
         extra_penalty = 0.0
         # 1. Validate app exists
         if action.app not in self._apps:
             return self._build_obs(
                 reward=-0.05,
+                done=False,
                 message=f"Unknown app '{action.app}'. Valid apps: {list(self._apps)}",
             )
             extra_penalty    = rule_penalty
             return self._build_obs(
                 reward=extra_penalty,
+                done=False,
                 message=f"Rule violation: {reason}",
             )
             self._efficiency -= 0.02
             return self._build_obs(
                 reward=-0.20,
+                done=False,
                 message=(
                     f"Stale schema: field '{result['schema_error']}' is no longer valid. "
                     "Check schema_hints for the current field name. "
             self._efficiency -= 0.02   # penalize failed/no-op actions
             return self._build_obs(
                 reward=-0.01,
+                done=False,
                 message=result.get("message", "Operation failed"),
             )
         # Schema adaptation bonus (agent used correct drifted field name)
         if result.get("schema_adapted"):
             self._schema_score = min(1.0, self._schema_score + 0.10)
             self._policy_score = min(1.0, self._policy_score + 0.05)
+        # Earn compliance + efficiency for every successful action
+        self._rule_score = min(1.0, self._rule_score + 0.10)
+        self._efficiency = min(1.0, self._efficiency + 0.10)
         # 5. Re-evaluate workflow completion
         self._wf_score = self._workflow.evaluate(self._apps)
         # 6. SLA check (only if a ticket was touched)
         sla_ok, sla_pen = self._rules.check_sla(
             result.get("ticket", {}),
+            self._step_count * 10,    # 10 min per step — P1 breaches at step 24 (step 12 under policy drift)
         )
         if not sla_ok:
             extra_penalty   += sla_pen
         # Workflow progress
         completed_steps = self._workflow.get_completed()
+        pending_steps   = self._workflow.get_pending()
         workflow_goal   = self._workflow.get_goal()
         # Reward breakdown snapshot
         breakdown = RewardBreakdown(