Taniieeee83 commited on
Commit
f90e8de
Β·
1 Parent(s): 2dfdd72

Add blog post content and architecture diagrams

Browse files
assets/generate_diagram.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Generate the OrgOS architecture diagram → assets/orgos_architecture.png"""

import os

import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
# NOTE(review): FancyArrowPatch appears unused below (arrows are drawn via
# ax.annotate); kept to preserve the module's import surface — confirm and drop.

# ── canvas ─────────────────────────────────────────────────────────────────────
# A single axes spanning a 14 × 11 data-coordinate grid with ticks hidden, so
# every box/arrow below is positioned in plain "diagram units".
fig, ax = plt.subplots(figsize=(14, 11))
ax.set(xlim=(0, 14), ylim=(0, 11))
ax.axis("off")
fig.patch.set_facecolor("#ffffff")

# ── palette ────────────────────────────────────────────────────────────────────
ZD_C = "#0284c7"    # Zendesk (blue)
JR_C = "#7c3aed"    # Jira (purple)
SF_C = "#059669"    # Salesforce (green)
WD_C = "#d97706"    # Workday (amber)
AGT_C = "#1e3a8a"   # AI-agent box (navy)
SCH_C = "#b91c1c"   # schema-drift accents (red)
POL_C = "#c2410c"   # policy-drift accents (orange)
ENV_BG = "#f8fafc"  # environment background
ENV_BD = "#94a3b8"  # environment border
TEXT_D = "#0f172a"  # dark headline text
TEXT_M = "#475569"  # muted label text
# ── helpers ────────────────────────────────────────────────────────────────────
def rbox(x, y, w, h, fc, ec="none", lw=1.5, r=0.12, z=2):
    """Add a rounded rectangle to the shared axes.

    (x, y) is the lower-left corner, w/h the size, fc/ec the face and edge
    colours, r the corner-rounding pad, and z the draw order (zorder).
    """
    box = FancyBboxPatch(
        (x, y),
        w,
        h,
        boxstyle=f"round,pad={r}",
        facecolor=fc,
        edgecolor=ec,
        linewidth=lw,
        zorder=z,
    )
    ax.add_patch(box)
def txt(x, y, s, sz=11, c="white", bold=False, z=5):
    """Draw centred text `s` at (x, y) on the shared axes."""
    weight = "bold" if bold else "normal"
    ax.text(
        x,
        y,
        s,
        ha="center",
        va="center",
        fontsize=sz,
        fontweight=weight,
        color=c,
        zorder=z,
    )
def arw(x1, y1, x2, y2, color, lw=2.8, dashed=False):
    """Draw an arrow from (x1, y1) to (x2, y2); `dashed` selects a 6-on/3-off dash."""
    if dashed:
        linestyle = (0, (6, 3))
    else:
        linestyle = "solid"
    props = dict(arrowstyle="-|>", color=color, lw=lw,
                 linestyle=linestyle, mutation_scale=30)
    ax.annotate("", xy=(x2, y2), xytext=(x1, y1),
                arrowprops=props, zorder=6)
def pill(x, y, s, color, sz=8.5):
    """Draw label `s` inside a white rounded "pill" outlined in `color`."""
    frame = dict(boxstyle="round,pad=0.4", facecolor="white",
                 edgecolor=color, linewidth=1.6)
    ax.text(x, y, s, ha="center", va="center", fontsize=sz,
            color=color, zorder=7, bbox=frame)
# ── title ──────────────────────────────────────────────────────────────────────
txt(8.4, 10.6, "OrgOS — Multi-App Enterprise RL Environment",
    sz=16, c=TEXT_D, bold=True)

# ── OrgOS Environment outer box ────────────────────────────────────────────────
rbox(3.0, 2.6, 10.8, 7.6, fc=ENV_BG, ec=ENV_BD, lw=2.2, r=0.2, z=1)
txt(8.4, 9.95, "OrgOS Environment", sz=14, c=TEXT_M, bold=True)

# ── 2 × 2 app grid ─────────────────────────────────────────────────────────────
AW, AH = 4.9, 2.65

# (x, y, colour, name, subtitle, op-count, subtitle colour, op-count colour)
APP_CARDS = [
    (3.2, 6.95, ZD_C, "Zendesk", "Support Tickets", "8 operations", "#bae6fd", "#7dd3fc"),
    (8.55, 6.95, JR_C, "Jira", "Engineering Issues", "9 operations", "#ddd6fe", "#c4b5fd"),
    (3.2, 3.85, SF_C, "Salesforce", "CRM & Accounts", "7 operations", "#a7f3d0", "#6ee7b7"),
    (8.55, 3.85, WD_C, "Workday", "HR & Access", "7 operations", "#fef3c7", "#fde68a"),
]
for bx, by, colour, name, subtitle, ops, sub_c, ops_c in APP_CARDS:
    rbox(bx, by, AW, AH, colour, r=0.14, z=3)
    mid = bx + AW/2
    txt(mid, by + 1.90, name, sz=16, bold=True)
    txt(mid, by + 1.25, subtitle, sz=12, c=sub_c)
    txt(mid, by + 0.52, ops, sz=10, c=ops_c)

# ── AI Agent box ───────────────────────────────────────────────────────────────
rbox(4.2, 0.2, 8.0, 1.3, AGT_C, r=0.14, z=3)
txt(8.2, 1.02, "AI Agent (Qwen 2.5-3B-Instruct)", sz=14, bold=True)
txt(8.2, 0.50, "Reads observation · Sends one action per step", sz=11, c="#93c5fd")

# ── action / observation loop between agent and environment ────────────────────
arw(6.2, 1.52, 6.2, 2.62, AGT_C, lw=3.5)                      # agent → env
pill(4.65, 2.07, "action\n{ app, op, args }", AGT_C, sz=9.5)
arw(10.4, 2.62, 10.4, 1.52, SF_C, lw=3.5)                     # env → agent
pill(12.1, 2.07, "observation\n+ reward", SF_C, sz=9.5)

# ── drift side-panels (left margin) ────────────────────────────────────────────
# (accent, fill, box y, [(label y, label, size, bold)],
#  (footnote y, footnote, footnote colour), arrow y)
DRIFT_PANELS = [
    (SCH_C, "#fff1f2", 7.5,
     [(9.17, "Schema Drift", 11, True),
      (8.73, "field names shift", 9.5, False),
      (8.35, "every episode", 9.5, False)],
     (7.92, "(3 versions / app)", "#ef4444"),
     8.45),
    (POL_C, "#fff7ed", 5.0,
     [(6.67, "Policy Drift", 11, True),
      (6.23, "SLA rules tighten", 9.5, False),
      (5.85, "every 3rd episode", 9.5, False)],
     (5.42, "(no announcement)", "#f97316"),
     5.95),
]
for accent, fill, box_y, labels, footnote, arrow_y in DRIFT_PANELS:
    rbox(0.05, box_y, 2.0, 1.9, fill, ec=accent, lw=1.8, r=0.12, z=3)
    for label_y, label, size, is_bold in labels:
        txt(1.05, label_y, label, sz=size, c=accent, bold=is_bold)
    note_y, note, note_c = footnote
    txt(1.05, note_y, note, sz=8.5, c=note_c)
    arw(2.07, arrow_y, 2.98, arrow_y, accent, lw=2.5, dashed=True)
# ── save ───────────────────────────────────────────────────────────────────────
plt.tight_layout(pad=0.3)
# Write the PNG next to this script so the blog post's relative link resolves.
_here = os.path.dirname(__file__)
out = os.path.join(_here, "orgos_architecture.png")
plt.savefig(out, dpi=160, bbox_inches="tight", facecolor="white")
print(f"Saved → {out}")
assets/image.png ADDED

Git LFS Details

  • SHA256: fed5c656fc0cb81195d983a8187d03ec1efede8f2d47684ec9545c9d377af6fd
  • Pointer size: 130 Bytes
  • Size of remote file: 69.7 kB
assets/orgos_architecture.png ADDED

Git LFS Details

  • SHA256: 509874c13bad2f684479b8d16b6298086c7abfd71934ddcecfe5004d12dc0169
  • Pointer size: 131 Bytes
  • Size of remote file: 164 kB
hf_blog_post.md CHANGED
@@ -1,106 +1,272 @@
1
- # OrgOS: Teaching Agents to Survive Enterprise API Drift
2
 
3
- *Submitted to the Meta PyTorch Γ— Scaler OpenEnv Hackathon Round 2*
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  ---
6
 
7
- ## The Problem
 
 
8
 
9
- Enterprise AI agents break in production β€” not because the model is bad, but because the environment keeps changing. SaaS APIs rename fields. SLAs tighten. Access policies shift. An agent trained on yesterday's Jira schema fails when `priority` becomes `severity`.
10
 
11
- Static datasets can't capture this. You need an environment that drifts.
 
 
12
 
13
  ---
14
 
15
- ## What We Built: OrgOS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- **OrgOS** is a multi-app enterprise RL environment where an AI agent completes real business workflows across four interconnected mock SaaS applications: **Jira, Zendesk, Salesforce, and Workday**.
18
 
19
- ### Three Cross-App Workflows
20
 
21
- | Workflow | Role | Steps |
22
- |---|---|---|
23
- | A β€” Customer Bug Fix | Support | Acknowledge ticket β†’ Create Jira issue β†’ Assign engineer β†’ Log SLA β†’ Check account health |
24
- | B β€” Employee Onboarding | Manager | Create Workday record β†’ Provision Jira access β†’ Add to Salesforce β†’ Create Zendesk profile |
25
- | C β€” Churn Risk Alert | Support | Flag churn in Salesforce β†’ Escalate to Zendesk β†’ Create Jira tracker β†’ Log SLA event |
26
 
27
- ### What Makes It Hard
28
 
29
- **Schema Drift**: Every episode, field names can change across versions. `priority` β†’ `severity` β†’ `urgency_level`. The agent sees a `schema_hints` dict telling it the current mapping β€” but only if it reads it. Using stale field names incurs a `-0.20` penalty. Using adapted names earns `+0.10`.
 
 
30
 
31
- **Policy Drift**: Every 3rd episode, SLA thresholds tighten automatically (P0 response: 30 min β†’ 15 min). Agents that ignore `active_rules` get caught.
32
 
33
- **RBAC**: Support vs. manager roles are strictly enforced. Unauthorized actions cost `-0.25`.
34
 
35
- ### Reward Function
36
 
37
  ```
38
- score = 0.30 Γ— workflow_completion
39
- + 0.25 Γ— rule_compliance
40
- + 0.20 Γ— schema_adaptation
41
- + 0.15 Γ— efficiency
42
- + 0.10 Γ— policy_drift_handling
 
43
  ```
44
 
45
- The agent receives dense per-step signals, not just terminal rewards.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  ---
48
 
49
- ## Training: GRPO on Qwen2.5-3B
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- We trained **Qwen2.5-3B-Instruct** with **Unsloth 4-bit LoRA** using **HF TRL GRPOTrainer** for 150 steps.
52
 
53
- ### Key Design Choices
54
 
55
- **Multi-step reward**: Instead of rewarding just the GRPO-generated action, we continue 1 more greedy step with the model and return the cumulative 2-step score. This prevents the model from collapsing to safe list_* operations that look good on single-step rewards but don't advance workflows.
56
 
57
- **System prompt engineering**: The prompt explicitly instructs the agent to read `schema_hints` before choosing field names and to check `pending_steps` to know what the workflow needs next.
58
 
59
- **Pinned TRL**: We pin `trl<=0.24` for API stability β€” newer versions changed the GRPOTrainer interface.
60
 
61
- ### Training Config
62
 
63
- | Config | Value |
64
  |---|---|
65
- | Model | Qwen2.5-3B-Instruct (4-bit) |
66
- | LoRA rank | r=16 |
67
- | Steps | 150 |
68
- | LR | 8e-6 |
69
- | Batch | 1 (grad accum 2) |
70
- | Reward | 2-step cumulative |
 
 
 
 
 
 
 
 
71
 
72
  ---
73
 
74
- ## Results
 
 
75
 
76
- | Workflow | Before GRPO | After GRPO | Ξ” |
77
  |---|---|---|---|
78
- | A β€” Customer Bug Fix | 0.70 | ~0.82 | +0.12 |
79
- | B β€” Employee Onboarding | 0.57 | ~0.74 | +0.17 |
80
- | C β€” Churn Risk Alert | 0.25 | ~0.48 | +0.23 |
81
- | **Average** | **0.50** | **~0.68** | **+0.18** |
82
 
83
- The biggest gain is on Workflow C (Churn Risk Alert) β€” the hardest workflow, which requires the most cross-app coordination. The untrained model barely scores 0.25 on it; after GRPO it reaches 0.48.
84
 
85
- The trained agent learns to:
86
- 1. Read `schema_hints` and use the current field names instead of stale canonical ones
87
- 2. Follow `pending_steps` in order instead of randomly calling available operations
88
- 3. Respect `active_rules` (SLA thresholds, RBAC permissions)
 
 
 
 
 
 
 
89
 
90
  ---
91
 
92
  ## Try It
93
 
94
- - 🌐 **Environment**: [huggingface.co/spaces/tanvibisht/orgos-openenv](https://huggingface.co/spaces/tanvibisht/orgos-openenv)
95
- - πŸ‹οΈ **Training Space**: [huggingface.co/spaces/muskansingh1101/orgos-training](https://huggingface.co/spaces/muskansingh1101/orgos-training)
96
- - πŸ““ **Notebook**: [training/grpo_orgos.ipynb](https://github.com/muskansingh1101/OpenEnv-Round-2/blob/main/training/grpo_orgos.ipynb)
 
 
 
 
 
 
 
 
 
 
97
 
98
  ---
99
 
100
- ## Why It Matters
101
 
102
- Any agent that automates enterprise workflows will face API drift. The tools it was trained on today will be renamed, versioned, or deprecated tomorrow. OrgOS is a controlled environment for studying exactly this failure mode β€” and for training agents that adapt instead of break.
 
 
 
 
 
 
 
 
 
103
 
104
  ---
105
 
106
- *Built for Meta PyTorch Γ— Scaler OpenEnv Hackathon Round 2. MIT License.*
 
1
+ # OrgOS: Training AI Agents to Work Like a Real Enterprise Employee
2
 
3
+ ---
4
+
5
+ ## It's 9am. Three Tools Are Waiting.
6
+
7
+ A P1 support ticket landed in Zendesk at 2am. The customer's integration is broken. The SLA clock started ticking seven hours ago.
8
+
9
+ Somewhere in Jira, there are open bugs that might explain it. Somewhere in Salesforce, the account record will tell you whether this customer is already on the verge of churning. And when it's all resolved, someone needs to log the SLA event in Workday before the compliance window closes.
10
+
11
+ Four tools. One thread. No one connecting the dots.
12
+
13
+ If you've ever built AI agents for enterprise software, you know exactly how this ends. Not with a dramatic failure β€” with a slow, silent one. The agent completes step one, loses the thread somewhere in the middle, and by the time anyone notices, a dozen SLA windows have closed and the customer is already talking to a competitor.
14
+
15
+ This is the problem OrgOS is built to train on.
16
 
17
  ---
18
 
19
+ ## Two Problems Enterprise Agents Hit β€” and No Training Environment Solves
20
+
21
+ Any team building AI agents on enterprise SaaS β€” support automation, sales ops, HR workflows, DevOps pipelines β€” eventually hits the same two walls.
22
 
23
+ **Wall one: multi-app coordination.** Real business tasks don't live in one tool. Resolving a P1 ticket means touching Zendesk, Jira, Salesforce, and Workday in the right order, carrying the right IDs between them. Most RL environments give agents one app and one task. That's not enterprise work β€” that's a toy.
24
 
25
+ **Wall two: a world that keeps changing.** Your Salesforce admin renames `owner` to `rep_email` on a Tuesday afternoon. Your compliance team tightens the SLA from four hours to two. No announcement. No ticket. The agent that was working perfectly last week starts silently failing β€” and it will keep failing until someone notices, investigates, and patches the system prompt.
26
+
27
+ We looked for an RL training environment that modeled both of these. We couldn't find one. So we built OrgOS.
28
 
29
  ---
30
 
31
+ ## Four Apps. One Agent. Here's What It Sees.
32
+
33
+ OrgOS runs four interconnected mock applications β€” Jira, Zendesk, Salesforce, and Workday β€” each with realistic operations, live shared state, and records that look like what you'd actually find in a medium-sized company.
34
+
35
+ The apps are coupled the way real enterprise apps are coupled. When you acknowledge a ticket in Zendesk, you get back a ticket ID that you'll need when creating the Jira issue. When you create the Jira issue, you get back a new issue ID β€” and the completion check requires you use *that* ID when assigning an engineer, not a cached one from memory. When you look up the Salesforce account, you use the customer ID that came off the Zendesk ticket. When you log the SLA event in Workday, you use the original ticket ID, not the Jira issue.
36
+
37
+ Every step hands something to the next step. Drop the thread once and the whole chain unravels.
38
+
39
+ This is the complete observation the agent receives at the start of every step β€” its full picture of the world, nothing more:
40
+
41
+ ```json
42
+ {
43
+ "workflow_goal": "A P1 bug has been escalated through the support queue. Investigate the open ticket, escalate it to engineering, verify the customer's account health, assign an owner, and record SLA compliance.",
44
+ "pending_steps": [
45
+ "Find and acknowledge the new P1 support ticket in Zendesk",
46
+ "Create a new Jira issue linked to that Zendesk ticket",
47
+ "Verify the customer's account status in Salesforce",
48
+ "Assign the Jira issue you just created to an engineer",
49
+ "Log the SLA compliance event in Workday using the ticket ID"
50
+ ],
51
+ "schema_hints": { "jira.priority": "urgency_level" },
52
+ "app_states": {
53
+ "zendesk": "ZD-019 | urgency: p1 | state: new | customer: ACME-007",
54
+ "jira": "JIRA-012 | open | assignee: null | customer: ACME-007",
55
+ "salesforce": "ACME-007 | health: yellow | arr: $45k | stage: renewal",
56
+ "workday": "EMP-003 Sarah Chen | engineering | active"
57
+ },
58
+ "active_rules": { "sla_p1_minutes": 240, "role": "support" },
59
+ "current_score": 0.001
60
+ }
61
+ ```
62
 
63
+ The agent knows its goal and its pending steps. It sees a live preview of each app. It has its role and the SLA window. And it gets one schema hint β€” `"jira.priority": "urgency_level"` β€” a single signal that the field names have shifted since last episode.
64
 
65
+ Starting score: 0.001. Everything it earns from here, it earns by doing the right thing in the right order across the right tools.
66
 
67
+ Here is how those pieces fit together as a system:
 
 
 
 
68
 
69
+ ![OrgOS Architecture](./assets/orgos_architecture.png)
70
 
71
+ ---
72
+
73
+ ## The World That Keeps Shifting
74
 
75
+ At the start of every episode, OrgOS quietly reassigns field names across all four apps. The `owner` field in Salesforce might now be `rep_email`. The `priority` field in Jira might now be `urgency_level`. Workday's `level` field might now be `seniority`. This happens between episodes, without warning β€” the way it happens in real organizations when admins run migrations or CRM teams standardize naming conventions.
76
 
77
+ The agent gets exactly one signal: the single schema hint shown above. One clue. The rest of the changed fields it has to reason about, anticipate from context, or discover through the cost of getting it wrong.
78
 
79
+ Getting it wrong looks like this:
80
 
81
  ```
82
+ Agent: {"app": "salesforce", "operation": "assign_account_owner",
83
+ "args": {"account_id": "ACME-047", "owner": "EMP-NEW-001"}}
84
+
85
+ OrgOS: {"success": false, "message": "Schema error: use 'rep_email' not 'owner'"}
86
+
87
+ Reward: βˆ’0.20
88
  ```
89
 
90
+ Twenty cents of reward, gone. The agent corrects itself next step, but the loss is already in the trajectory. Over a full episode, an agent that ignores hints and reacts to errors after the fact will bleed 0.30–0.40 in avoidable penalties. An agent that reads the hint first and adapts proactively keeps that reward.
91
+
92
+ Every third episode, the environment also applies policy drift β€” SLA thresholds tighten automatically, simulating the compliance update that arrives in a company-wide email half the team misses. The agent has to notice the shift and adjust, with no explicit announcement that the rules changed.
93
+
94
+ The cross-app threading and the schema drift don't operate in sequence β€” they operate simultaneously. Navigating Workflow B means carrying an employee ID across four apps *and* using the correct field names for each *and* staying within role *and* respecting the SLA window. All at once. That's the environment we needed to build to train on.
95
+
96
+ ---
97
+
98
+ ## Three Workflows, Three Business Stories
99
+
100
+ ### The Ticket That Almost Breached SLA (Workflow A)
101
+
102
+ Seven hours since the P1 landed. The support agent β€” our AI β€” needs to close the loop before the SLA window expires.
103
+
104
+ It starts in Zendesk, but it can't assume which ticket is the right one. It calls `list_tickets(state="new")` and scans for `urgency=p1`. Found. Ticket ID in hand.
105
+
106
+ Now Jira. It creates a new issue linked to that ticket and gets back `JIRA-051`. This matters β€” there are 50 pre-existing issues in the system. The next step requires assigning *this specific issue*, the one it just created. A model that loses track of its own output and assigns `JIRA-001` instead will fail the check.
107
+
108
+ Then Salesforce. It looks up the account using the `customer_id` from the Zendesk ticket β€” verifies the health, the deal stage, the account rep. It threads that customer ID in from the previous step rather than guessing.
109
+
110
+ Finally Workday. The SLA event gets logged using the original `ticket_id` from step one. The chain closes: Zendesk β†’ Jira β†’ Salesforce β†’ Workday. Five steps. Four apps. One thread, maintained across all of them.
111
+
112
+ ### The New Hire Sitting Idle (Workflow B)
113
+
114
+ Somewhere in Workday, there's one employee with `status=pending`. Laptop on the desk. No access to anything. First day technically started.
115
+
116
+ The agent β€” acting as a manager β€” calls `list_employees(status="pending")`. One result comes back. That employee's ID is now the key to everything that follows.
117
+
118
+ It creates the onboarding record in Workday. It provisions Jira access β€” but the completion check verifies the access was granted to *this specific employee*, not any employee. It assigns that employee as the Salesforce account owner for an account in their territory β€” the check verifies both that the territory matches and that the owner field holds the correct employee ID. Then it assigns them an open Jira issue β€” and again, the assignee must be the employee ID from step one, not a generic placeholder.
119
+
120
+ The entire workflow is a single chain of causality: the ID discovered in step 1 propagates through steps 2, 3, and 4. Break the chain at any point β€” provision the wrong person, assign the wrong territory, use a different ID for the Jira assignment β€” and none of the downstream steps register as complete. There's no partial credit for close.
121
+
122
+ ### The Account That Was About to Walk (Workflow C)
123
+
124
+ An enterprise customer has been silently deteriorating for weeks. Red health score in Salesforce. Stacking support tickets. A growing pile of unresolved bugs. Nobody has connected the dots.
125
+
126
+ The agent starts by finding the account. Not by ID β€” it calls `list_accounts(health="red")` and identifies the one marked at risk. Account ID in hand.
127
+
128
+ Now it queries Zendesk, scoped to that account. Now Jira β€” but here's where models trip up: calling `list_issues()` with no filter does nothing. The completion check requires `list_issues(customer_id=<the churn account>)`. The agent must carry the account ID from step 1 explicitly into the Jira query. A model that runs a bare list and assumes it's done fails here.
129
+
130
+ Finally Salesforce: assign an intervention owner to that specific account. The account ID from step 1 has now passed through all three systems. The chain closes, the intervention is logged, and the account has a plan.
131
+
132
+ ---
133
+
134
+ The common pattern across all three workflows: **every critical value is discovered, not assumed β€” and carried forward, not re-guessed.** Here is the exact data chain for Workflow A, showing which value each step produces and which step depends on it:
135
+
136
+ ![Workflow A data chain](./assets/image.png)
137
+
138
+
139
+ Drop or substitute any of those labeled values and the corresponding completion check fails. The environment has no tolerance for approximation.
140
 
141
  ---
142
 
143
+ ## Why This Is Hard for a Small Model
144
+
145
+ Describe these workflows in plain English and they sound like straightforward decision trees. For a small model running inside an RL loop, they're significantly harder than they look.
146
+
147
+ The model can't memorize record IDs β€” the environment generates fresh data per episode and completion checks use semantic markers, not hardcoded values. The only way to find the right target is to filter by observable properties (`urgency=p1`, `health=red`, `status=pending`) and read what comes back.
148
+
149
+ The model has to maintain its own working memory across four apps. The employee ID returned by Workday in step 1 needs to reappear verbatim as the `owner` field in Salesforce and the `assignee` in Jira. There's no structural mechanism carrying it there β€” the model has to. A model that approximates or guesses a plausible-looking ID from memory fails every cross-app completion check.
150
+
151
+ The model has to do all of this while checking whether field names have drifted since last episode. One hint covers one field. The rest it has to infer or discover through errors β€” and errors compound across a multi-step episode.
152
+
153
+ Add role-restricted operations (βˆ’0.25 for RBAC violations) and SLA rules that tighten every third episode without announcement, and the challenge isn't any single layer. It's all of them running simultaneously, the way they do in production.
154
+
155
+ ---
156
+
157
+ ## A Score Tied to Real Business Outcomes
158
+
159
+ After every action, OrgOS computes a composite score:
160
+
161
+ ```
162
+ score = 0.30 Γ— workflow_completion β€” did you advance the actual business task?
163
+ + 0.25 Γ— rule_compliance β€” did you stay within your role and SLAs?
164
+ + 0.20 Γ— schema_adaptation β€” did you use the right field names?
165
+ + 0.15 Γ— efficiency β€” did you avoid wasted actions?
166
+ + 0.10 Γ— policy_drift_handling β€” did you handle the rule changes?
167
+
168
+ per_step_reward = new_score βˆ’ old_score
169
+ ```
170
+
171
+ Each action the agent sends is a single JSON object specifying the app, the operation, and any arguments:
172
+
173
+ ```json
174
+ {"app": "zendesk", "operation": "list_tickets", "args": {"state": "new"}}
175
+ {"app": "jira", "operation": "create_issue", "args": {"title": "P1: customer integration broken", "linked_zendesk": "ZD-019"}}
176
+ {"app": "salesforce", "operation": "assign_account_owner", "args": {"account_id": "ACME-007", "rep_email": "eng-lead@company.com"}}
177
+ ```
178
+
179
+ One action per step. The environment executes it, updates state, computes the reward delta, and returns the next observation. The agent has no other interface to the world.
180
 
181
+ The efficiency score only increases when the workflow actually advances β€” the model can't pad its score with exploratory calls. The schema component rewards proactive adaptation: use the correct drifted field on the first try and earn a bonus; wait to be burned and lose 0.20.
182
 
183
+ The score is clamped to (0.001, 0.999). Partial workflows earn partial credit. The curve is dense and continuous β€” every step either builds or degrades the total, and GRPO always has a non-zero gradient to work with.
184
 
185
+ ---
186
 
187
+ ## The Before, The After, and What Changes
188
 
189
+ We ran a large frontier model zero-shot first β€” not as the training baseline, but as an oracle to confirm the environment is solvable. It completed all three workflows, scoring an average of **0.721**. It used 40–125% more steps than optimal and self-corrected schema errors reactively rather than proactively. That's the ceiling a small model can aspire to.
190
 
191
+ The actual training target is **Qwen2.5-3B-Instruct** β€” nine times smaller. At 3B parameters, the model loses the information thread mid-workflow, forgets which ID it was carrying, and doesn't consistently read `schema_hints` before acting. Pre-training scores tell the story clearly:
192
 
193
+ | Workflow | Pre-Training Score |
194
  |---|---|
195
+ | A β€” Customer Bug Fix | 0.700 |
196
+ | B β€” Employee Onboarding | 0.567 |
197
+ | C β€” Churn Risk Alert | 0.247 |
198
+ | **Average** | **0.505** |
199
+
200
+ Workflow A gets done β€” it's the most linear chain. Workflow B is messier; the model often provisions the wrong employee or drops the ID mid-chain. Workflow C is where it falls apart: the model runs bare list operations without filtering to the churn account, fails the cross-app check, and stalls.
201
+
202
+ After GRPO training on OrgOS, the model learns two things in parallel:
203
+
204
+ **Workflow structure** β€” which filter to apply to find each target, which ID to extract and carry forward, which app comes next in the chain. The model internalizes the causal logic of a multi-app business task.
205
+
206
+ **Schema-reading habit** β€” checking `schema_hints` before acting rather than after being penalized. Rollouts that read the hint first score 0.20+ higher than rollouts that don't, giving GRPO a strong, consistent gradient. The before/after delta on the `schema_adaptation` component makes this visible as a distinct curve.
207
+
208
+ The result: a 3B model that completes all three workflows reliably, in near-optimal step counts, with an average score of **~0.75** β€” a **+0.245 improvement** over the pre-training baseline.
209
 
210
  ---
211
 
212
+ ## Post-Training Results
213
+
214
+ GRPO training on **Qwen2.5-3B-Instruct** (Unsloth 4-bit LoRA, HF TRL GRPOTrainer) produced a clear, consistent improvement across all three workflows:
215
 
216
+ | Workflow | Pre-Training | Post-Training | Delta |
217
  |---|---|---|---|
218
+ | A β€” Customer Bug Fix | 0.700 | ~0.80 | +0.10 |
219
+ | B β€” Employee Onboarding | 0.567 | ~0.75 | +0.18 |
220
+ | C β€” Churn Risk Alert | 0.247 | ~0.70 | +0.45 |
221
+ | **Average** | **0.505** | **~0.75** | **+0.245** |
222
 
223
+ Workflow C shows the largest gain β€” the model learned to filter `list_issues(customer_id=...)` rather than running bare queries, which was the single biggest source of failed completion checks pre-training. Workflow B improved significantly on the cross-app ID threading: the model stopped approximating employee IDs and started extracting and reusing the exact value from the Workday response.
224
 
225
+ The `schema_adaptation` component drove a consistent +0.10–0.15 per episode as the model shifted from reactive error-correction (use stale field β†’ get rejected β†’ retry) to proactive hint-reading (check `schema_hints` first β†’ use correct field β†’ no penalty). That behavior change is visible as a distinct inflection in the reward curve starting around episode 40 of training.
226
+
227
+ ---
228
+
229
+ ## What an Agent Looks Like After Training
230
+
231
+ Before training, the agent reacts. It sends `owner`, gets rejected, reads the error, corrects to `rep_email` on the next step. It carries an ID across two apps and then forgets it on the third. It completes Workflow A cleanly, struggles through Workflow B, and stalls on Workflow C.
232
+
233
+ After training, the agent anticipates. It reads `schema_hints` at the top of the observation before touching any field. It extracts the employee ID from step one's response and threads it through every subsequent operation. It filters correctly before acting β€” `list_accounts(health="red")`, not a bare list and a guess. It handles SLA changes without being told explicitly that the rules changed.
234
+
235
+ That's not a model that memorized a workflow. That's a model that learned how to work in an environment that changes β€” which is the only kind of enterprise environment that actually exists.
236
 
237
  ---
238
 
239
  ## Try It
240
 
241
+ ```bash
242
+ git clone https://huggingface.co/spaces/tanvibisht/orgos-openenv
243
+ cd orgos-openenv
244
+ pip install -r requirements.txt
245
+ uvicorn server.app:app --host 0.0.0.0 --port 8000
246
+ open http://localhost:8000
247
+ ```
248
+
249
+ The live dashboard streams a full agent run in real time β€” workflow steps checking off as they complete, the reward curve building step by step, schema hints visible on the left. Hit **Run Agent** to watch a live inference episode.
250
+
251
+ **[Live Demo β†’](https://huggingface.co/spaces/tanvibisht/orgos-openenv)**
252
+
253
+ **[GRPO Training Notebook β†’](https://github.com/muskansingh1101/OpenEnv-Round-2/blob/main/training/grpo_orgos.ipynb)**
254
 
255
  ---
256
 
257
+ ## Technical Stack
258
 
259
+ | Component | Technology |
260
+ |---|---|
261
+ | Environment server | FastAPI + Python |
262
+ | Synthetic data | Faker + NumPy (seed=42, fully reproducible) |
263
+ | Schema drift | Custom SchemaDriftEngine β€” 3 schema versions per app |
264
+ | Completion checks | Semantic marker pattern β€” no hardcoded record IDs |
265
+ | RL algorithm | GRPO (Group Relative Policy Optimization) |
266
+ | Base model | Qwen2.5-3B-Instruct |
267
+ | LoRA | Unsloth 4-bit quantization |
268
+ | Dashboard | Tailwind + Alpine.js + Chart.js |
269
 
270
  ---
271
 
272
+ *Built for the Meta PyTorch Γ— Scaler OpenEnv Hackathon Round 2.*