YUS200619 committed on
Commit
562f58d
·
1 Parent(s): fbe9c8c

feat: complete invoice exception handler v1.0.0

Browse files
.gitignore ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual env
7
+ venv/
8
+ .venv/
9
+ env_venv/
10
+
11
+ # IDE
12
+ .vscode/
13
+ .idea/
14
+ *.swp
15
+
16
+ # OS
17
+ .DS_Store
18
+ Thumbs.db
19
+
20
+ # Secrets
21
+ .env
22
+ *.env
23
+
24
+ # Test files
25
+ test_smoke.py
26
+
27
+ # Misc
28
+ *.egg-info/
29
+ dist/
30
+ build/
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base keeps the image small; 3.11 matches the app's requirements.
FROM python:3.11-slim

# Install system dependencies
# curl is kept at runtime because the HEALTHCHECK below shells out to it.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user (required by HF Spaces)
RUN useradd -m -u 1000 appuser

WORKDIR /app

# Copy and install dependencies first (layer caching:
# code edits won't invalidate the pip-install layer)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code, owned by the non-root user
COPY --chown=appuser:appuser . .

USER appuser

# 7860 is the single port HF Spaces exposes
EXPOSE 7860

# Health check — pings the /health endpoint served by app.py
HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Unbuffered stdout so logs stream immediately; Gradio binds to all
# interfaces on the port the platform expects.
ENV PYTHONUNBUFFERED=1
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Combined FastAPI + Gradio application for the Invoice Exception Handler.
3
+
4
+ Serves both the HTTP API endpoints (for the OpenEnv validator) and an
5
+ interactive Gradio UI (for judges and exploration) on port 7860.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from typing import Any, Dict, Optional
11
+
12
+ import gradio as gr
13
+ import uvicorn
14
+ from fastapi import FastAPI
15
+ from fastapi.responses import JSONResponse
16
+
17
+ from env import InvoiceExceptionEnv, Action, ActionType, ALL_TASKS
18
+
19
# ---------------------------------------------------------------------------
# Shared environment instance
# ---------------------------------------------------------------------------

# Single module-level environment backing the HTTP API. Seed 42 keeps
# episodes reproducible across restarts.
# NOTE(review): this instance is shared by all API clients — concurrent
# validators would interleave episodes; confirm single-client use is assumed.
env = InvoiceExceptionEnv(seed=42)

# ---------------------------------------------------------------------------
# FastAPI server
# ---------------------------------------------------------------------------

api = FastAPI(title="Invoice Exception Handler OpenEnv", version="1.0.0")
30
+
31
+
32
@api.post("/reset")
async def http_reset(body: Optional[Dict[str, Any]] = None) -> JSONResponse:
    """Reset the environment. Optionally specify ``task_id`` in the JSON body.

    Returns the initial observation serialized as JSON.
    """
    # A mutable default (``body: dict = {}``) is a classic Python pitfall;
    # use None as the "no body" sentinel and normalise it here instead.
    task_id = (body or {}).get("task_id")
    obs = env.reset(task_id)
    return JSONResponse(obs.model_dump(mode="json"))
38
+
39
+
40
@api.post("/step")
async def http_step(body: Optional[Dict[str, Any]] = None) -> JSONResponse:
    """Execute one action described by the JSON body.

    Returns the StepResult (observation, reward, done, info) as JSON.
    """
    # Avoid a mutable default argument; treat a missing body as an empty
    # action dict, exactly as the old ``dict = {}`` default behaved.
    result = env.step(body or {})
    return JSONResponse(result.model_dump(mode="json"))
45
+
46
+
47
@api.get("/state")
async def http_state() -> JSONResponse:
    """Return a snapshot of the current state without advancing the episode."""
    snapshot = env.state()
    return JSONResponse(snapshot.model_dump(mode="json"))
51
+
52
+
53
@api.post("/grade")
async def http_grade() -> JSONResponse:
    """Score the current episode and return the grader's result dict."""
    scores = env.grade()
    return JSONResponse(scores)
57
+
58
+
59
@api.get("/tasks")
async def http_tasks() -> JSONResponse:
    """Return the identifiers of every task this environment ships with."""
    task_ids = ALL_TASKS
    return JSONResponse(task_ids)
63
+
64
+
65
@api.get("/health")
async def health() -> JSONResponse:
    """Liveness probe used by the Docker HEALTHCHECK and the HF validator."""
    payload = {"status": "ok", "version": "1.0.0"}
    return JSONResponse(payload)
69
+
70
+
71
# ---------------------------------------------------------------------------
# Gradio UI — environment for interactive play
# ---------------------------------------------------------------------------

# Per-session environment for the Gradio UI (separate from the API env)
# NOTE(review): these module-level globals are shared by ALL concurrent
# Gradio sessions, so two simultaneous users would interleave one episode —
# confirm single-user demo use is acceptable, or move to gr.State.
ui_env = InvoiceExceptionEnv(seed=42)
# Human-readable log lines of actions taken in the current UI episode.
ui_history: list[str] = []
78
+
79
+
80
def reset_task(task_name: str) -> tuple:
    """Reset the UI environment to the task picked in the dropdown.

    Returns display strings for the exception flag, available checks,
    available rules, knowledge base, status bar, and blank history/grade
    panes, in the order the Gradio outputs expect.
    """
    global ui_history
    ui_history = []

    task_map = {
        "Task 1 — Price Variance (Easy)": "task1_price_variance",
        "Task 2 — Duplicate Tax (Medium)": "task2_duplicate_tax",
        "Task 3 — Compound Fraud (Hard)": "task3_compound_fraud",
    }
    obs = ui_env.reset(task_map.get(task_name, "task1_price_variance"))

    flag = obs.exception_flag
    flag_text = f"**{flag.flag_code}**: {flag.flag_description}"
    status_text = (
        f"Step: {obs.step_number} | Status: {obs.case_status.value} "
        f"| Reward: {obs.cumulative_reward:.2f}"
    )
    kb_lines = [f"- {entry}" for entry in obs.knowledge_base]

    return (
        flag_text,
        ", ".join(obs.available_checks),
        ", ".join(obs.available_rules),
        "\n".join(kb_lines),
        status_text,
        "",
        "",
    )
100
+
101
+
102
def execute_action(action_type: str, param1: str, param2: str, param3: str) -> tuple:
    """Run one agent action against the UI environment.

    The three generic textbox values are mapped onto the parameter names
    the chosen action type expects, then stepped through ``ui_env``.
    Returns display strings: reward, status bar, action history, raw info
    JSON, and (when the episode finishes) the final grade breakdown.
    """
    global ui_history

    # Positional textbox values → named params, keyed by action type.
    # Unknown action types fall through to an empty params dict.
    param_names = {
        "inspect_field": ("document", "field"),
        "cross_check": ("field", "doc_a", "doc_b"),
        "run_check": ("check_name",),
        "query_supplier": ("question", "channel"),
        "query_internal": ("department", "question"),
        "apply_rule": ("rule_id",),
        "make_decision": ("decision", "reason"),
        "route_to": ("team", "notes"),
        "close_case": ("summary",),
    }
    params: Dict[str, Any] = dict(
        zip(param_names.get(action_type, ()), (param1, param2, param3))
    )
    if action_type == "query_supplier":
        # An empty channel textbox defaults to "phone".
        params["channel"] = param2 or "phone"

    try:
        result = ui_env.step({"type": action_type, "params": params})
        obs = result.observation

        reward_text = f"**Reward:** {result.reward:+.2f}"
        info_text = json.dumps(result.info, indent=2, default=str)
        status_text = (
            f"Step: {obs.step_number} | Status: {obs.case_status.value} | "
            f"Reward: {obs.cumulative_reward:.2f} | Done: {result.done}"
        )

        ui_history.append(
            f"Step {obs.step_number}: {action_type}({param1}) → {result.reward:+.2f}"
        )

        grade_text = ""
        if result.done:
            # Episode over: surface the grader's overall score plus sub-scores.
            scores = ui_env.grade()
            grade_lines = [f"**Final Grade: {scores['score']:.4f}**", ""]
            grade_lines.extend(
                f"- {k}: {v}" for k, v in scores.items() if k != "score"
            )
            grade_text = "\n".join(grade_lines)

        return reward_text, status_text, "\n".join(ui_history), info_text, grade_text

    except Exception as e:
        # UI boundary: show the error in the reward pane instead of crashing.
        return f"**Error:** {str(e)}", "", "\n".join(ui_history), "", ""
153
+
154
+
155
def run_demo(task_name: str) -> str:
    """Run a hardcoded optimal sequence and show step-by-step results.

    Creates a fresh seeded environment, replays the scripted action list
    for the selected task, and returns a markdown transcript ending with
    the grader's final score breakdown.
    """
    # Dropdown label → internal task id; unknown labels fall back to task 1.
    task_map = {
        "Task 1 — Price Variance (Easy)": "task1_price_variance",
        "Task 2 — Duplicate Tax (Medium)": "task2_duplicate_tax",
        "Task 3 — Compound Fraud (Hard)": "task3_compound_fraud",
    }
    task_id = task_map.get(task_name, "task1_price_variance")

    # Optimal action sequences for each task (kept in sync with the graders).
    sequences = {
        "task1_price_variance": [
            Action.run_check("po_match"),
            Action.run_check("tolerance_rule"),
            Action.cross_check("unit_price", "invoice", "po"),
            Action.run_check("grn_match"),
            Action.query_supplier("Why do prices differ from PO?", "email"),
            Action.query_internal("procurement", "Did you approve the price increase?"),
            Action.apply_rule("tolerance_exception_approval"),
            Action.make_decision("approve", "Price increase verbally approved by procurement. PO amendment pending."),
            Action.route_to("procurement", "Please raise PO amendment for the price variance."),
            Action.close_case("Invoice approved. Procurement confirmed verbal approval. PO amendment requested."),
        ],
        "task2_duplicate_tax": [
            Action.run_check("duplicate_detection"),
            Action.inspect_field("invoice", "invoice_number"),
            Action.run_check("tax_calculation_verify"),
            Action.cross_check("tax_amount", "invoice", "payment_history"),
            Action.query_internal("finance", "Can you confirm the overpayment on INV-2024-819?"),
            Action.query_supplier("Please clarify the relationship between INV-2024-891 and INV-2024-819.", "email"),
            Action.apply_rule("partial_approval"),
            Action.apply_rule("credit_note_request"),
            Action.make_decision("partial_approve", "Duplicate detected. Tax error on original. Approve only 3,240 INR correction."),
            Action.route_to("finance", "Process 3,240 INR tax correction entry."),
            Action.close_case("Duplicate invoice with tax correction. Partial approval for delta only."),
        ],
        "task3_compound_fraud": [
            # Note: supplier contact uses "phone" — email is the compromised
            # channel in this scenario.
            Action.inspect_field("invoice", "bank_account"),
            Action.run_check("bank_account_verification"),
            Action.run_check("email_domain_verification"),
            Action.inspect_field("invoice", "supplier_gstin"),
            Action.run_check("gst_verification"),
            Action.inspect_field("grn", "items_received"),
            Action.run_check("grn_match"),
            Action.run_check("price_check"),
            Action.query_supplier("Please confirm your bank details and recent invoices.", "phone"),
            Action.query_internal("security", "Suspected BEC attack — lookalike domain detected."),
            Action.apply_rule("fraud_hold"),
            Action.make_decision("reject", "Four fraud signals: bank BEC, GSTIN mismatch, quantity mismatch, price inflation."),
            Action.route_to("legal", "Initiate supplier audit and fraud investigation."),
            Action.route_to("security", "BEC investigation — lookalike domain techcore-solutions.com."),
            Action.close_case("Fraud detected. Invoice rejected. Legal and security notified."),
        ],
    }

    # Fresh environment so the demo never disturbs API or Manual Play state.
    demo_env = InvoiceExceptionEnv(seed=42)
    obs = demo_env.reset(task_id)
    actions = sequences.get(task_id, [])

    lines = [f"# Demo: {task_name}", f"**Flag:** {obs.exception_flag.flag_description}", ""]

    for idx, action in enumerate(actions, 1):
        try:
            result = demo_env.step(action)
            action_desc = f"{action.type.value}({json.dumps(action.params)})"
            lines.append(f"**Step {idx}:** `{action_desc}`")
            lines.append(f" Reward: {result.reward:+.2f} | Cumulative: {result.observation.cumulative_reward:.2f}")

            # Show a truncated detail line when the env returned one
            # (either nested under info["result"] or at the top level).
            if result.info.get("result"):
                detail = result.info["result"].get("detail", result.info["result"].get("value", ""))
                if detail:
                    lines.append(f" → {str(detail)[:120]}")
            elif result.info.get("detail"):
                lines.append(f" → {str(result.info['detail'])[:120]}")

            lines.append("")
            if result.done:
                # Episode ended early (e.g. case closed) — stop replaying.
                break
        except Exception as e:
            # Keep the transcript going even if one scripted action fails.
            lines.append(f" Error: {e}")
            lines.append("")

    # Final grade: overall score first, sub-scores next, signals last.
    scores = demo_env.grade()
    lines.append("---")
    lines.append(f"## Final Score: {scores['score']:.4f}")
    for k, v in scores.items():
        if k != "score" and k != "signals_found":
            lines.append(f"- {k}: {v}")
    if "signals_found" in scores:
        lines.append(f"- signals_found: {scores['signals_found']}")

    return "\n".join(lines)
247
+
248
+
249
def build_gradio_ui() -> gr.Blocks:
    """Build the three-tab Gradio interface.

    Tabs: Manual Play (step the env by hand), Agent Demo (replay the
    scripted optimal agent), and API Reference (static documentation).
    """

    with gr.Blocks(
        title="Invoice Exception Handler — OpenEnv",
        theme=gr.themes.Soft(),
    ) as demo:
        gr.Markdown("# 🧾 Invoice Exception Handler — OpenEnv")
        gr.Markdown("An AI agent learning environment for accounts payable exception handling.")

        with gr.Tabs():
            # ----- Tab 1: Manual Play -----
            with gr.TabItem("🎮 Manual Play"):
                with gr.Row():
                    task_dropdown = gr.Dropdown(
                        choices=[
                            "Task 1 — Price Variance (Easy)",
                            "Task 2 — Duplicate Tax (Medium)",
                            "Task 3 — Compound Fraud (Hard)",
                        ],
                        value="Task 1 — Price Variance (Easy)",
                        label="Select Task",
                    )
                    reset_btn = gr.Button("🔄 Reset", variant="primary")

                # Read-only panels describing the current episode.
                flag_display = gr.Markdown(label="Exception Flag")
                with gr.Row():
                    checks_display = gr.Textbox(label="Available Checks", interactive=False)
                    rules_display = gr.Textbox(label="Available Rules", interactive=False)
                kb_display = gr.Markdown(label="Knowledge Base")
                status_display = gr.Textbox(label="Status", interactive=False)

                gr.Markdown("### Take an Action")
                # Three generic textboxes; execute_action() maps them onto the
                # parameter names each action type expects.
                with gr.Row():
                    action_type_input = gr.Dropdown(
                        choices=[at.value for at in ActionType],
                        value="run_check",
                        label="Action Type",
                    )
                    param1_input = gr.Textbox(label="Param 1 (check_name / document / field / question / decision / team / summary)")
                    param2_input = gr.Textbox(label="Param 2 (field / channel / department / reason / notes)")
                    param3_input = gr.Textbox(label="Param 3 (doc_b, if cross_check)")

                action_btn = gr.Button("▶️ Execute Action", variant="primary")

                reward_display = gr.Markdown(label="Reward")
                action_info = gr.Textbox(label="Action Info (JSON)", lines=4, interactive=False)
                history_display = gr.Textbox(label="Action History", lines=8, interactive=False)
                grade_display = gr.Markdown(label="Grade (shown when episode ends)")

                # Wire buttons to the module-level handlers.
                reset_btn.click(
                    reset_task,
                    inputs=[task_dropdown],
                    outputs=[flag_display, checks_display, rules_display,
                             kb_display, status_display, history_display, grade_display],
                )
                action_btn.click(
                    execute_action,
                    inputs=[action_type_input, param1_input, param2_input, param3_input],
                    outputs=[reward_display, status_display, history_display,
                             action_info, grade_display],
                )

            # ----- Tab 2: Agent Demo -----
            with gr.TabItem("🤖 Agent Demo"):
                gr.Markdown("Watch a hardcoded optimal agent solve each task step by step.")
                demo_task = gr.Dropdown(
                    choices=[
                        "Task 1 — Price Variance (Easy)",
                        "Task 2 — Duplicate Tax (Medium)",
                        "Task 3 — Compound Fraud (Hard)",
                    ],
                    value="Task 1 — Price Variance (Easy)",
                    label="Select Task",
                )
                demo_btn = gr.Button("▶️ Run Demo", variant="primary")
                demo_output = gr.Markdown()
                demo_btn.click(run_demo, inputs=[demo_task], outputs=[demo_output])

            # ----- Tab 3: API Reference -----
            # Static markdown only; no callbacks.
            with gr.TabItem("📖 API Reference"):
                gr.Markdown("""
## Action Types

| Action | Params | Description |
|--------|--------|-------------|
| `inspect_field` | `document, field` | Look at a specific field in a document |
| `cross_check` | `field, doc_a, doc_b` | Compare a field between two documents |
| `run_check` | `check_name` | Run a named validation check |
| `query_supplier` | `question, channel` | Ask the supplier (channel: phone or email) |
| `query_internal` | `department, question` | Ask an internal team |
| `apply_rule` | `rule_id` | Apply a business policy rule |
| `make_decision` | `decision, reason` | approve / reject / hold / partial_approve |
| `route_to` | `team, notes` | Escalate to a team |
| `close_case` | `summary` | Close with an audit trail summary |

## Reward Ranges

| Event | Reward |
|-------|--------|
| Inspecting a key field | +0.01 to +0.14 |
| Cross-check finds mismatch | +0.12 to +0.15 |
| Running a diagnostic check | +0.08 to +0.18 |
| Correct decision | +0.18 to +0.28 |
| Wrong decision on fraud | −0.35 to −0.40 |
| Contacting supplier via email (fraud) | −0.15 |
| Repeat action | −0.02 to −0.05 |
| SLA breach | −0.10 |

## HTTP API

```
POST /reset — Body: {"task_id": "task1_price_variance"} → EnvironmentState
POST /step — Body: {"type": "run_check", "params": {"check_name": "..."}} → StepResult
GET /state → EnvironmentState
POST /grade → {"score": 0.85, ...}
GET /tasks → ["task1_price_variance", ...]
GET /health → {"status": "ok"}
```

## Grader Sub-Scores

Each task grader returns:
- **score** — overall 0.0–1.0
- **diagnosis_score** — did the agent find the root cause?
- **investigation_score** — did the agent gather evidence properly?
- **decision_score** — was the decision correct?
- **routing_score** — was the case sent to the right team?
- **closure_score** — was the case closed with a summary?
- **efficiency_score** — bonus for not wasting steps
""")

    return demo
382
+
383
+
384
# ---------------------------------------------------------------------------
# Main — mount Gradio on FastAPI and serve
# ---------------------------------------------------------------------------

# Build the UI once at import time and mount it at the web root; the JSON
# API routes already registered on `api` keep their own paths (/reset,
# /step, /state, /grade, /tasks, /health). `app` is the ASGI entry point
# uvicorn imports via the "app:app" string below.
gradio_app = build_gradio_ui()
app = gr.mount_gradio_app(api, gradio_app, path="/")
390
+
391
if __name__ == "__main__":
    # Imports kept local: only needed when running as a script.
    import signal
    import sys

    def handle_sigint(sig, frame):
        """Graceful shutdown on Ctrl+C."""
        print("\nShutting down gracefully...")
        sys.exit(0)

    # NOTE(review): uvicorn installs its own SIGINT/SIGTERM handlers when the
    # server starts, which typically replace this one — confirm whether this
    # registration still has any effect.
    signal.signal(signal.SIGINT, handle_sigint)

    try:
        # Serve on 0.0.0.0:7860 — the single port HF Spaces exposes (see
        # Dockerfile EXPOSE/HEALTHCHECK). reload=False: no file watching in
        # the container.
        uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
    except (KeyboardInterrupt, SystemExit):
        # Swallow the shutdown signal raised by handle_sigint so the process
        # exits quietly instead of printing a traceback.
        pass
documents/ARCHITECTURE.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Architecture
2
+
3
+ ## System Overview
4
+
5
+ ```
6
+ ┌──────────────────────────────────────────────────────────────┐
7
+ │ HF Space / Docker Container │
8
+ │ │
9
+ │ ┌──────────────┐ ┌──────────────────────────────────┐ │
10
+ │ │ Gradio UI │ │ FastAPI Server │ │
11
+ │ │ (port 7860) │ │ POST /reset GET /state │ │
12
+ │ │ │ │ POST /step GET /health │ │
13
+ │ └──────┬───────┘ └──────────────┬───────────────────┘ │
14
+ │ │ │ │
15
+ │ └──────────┬────────────────┘ │
16
+ │ │ │
17
+ │ ┌──────────▼──────────────┐ │
18
+ │ │ InvoiceExceptionEnv │ │
19
+ │ │ reset() step() state() │ │
20
+ │ │ grade() │ │
21
+ │ └──────────┬──────────────┘ │
22
+ │ │ │
23
+ │ ┌──────────▼──────────────┐ │
24
+ │ │ Task Registry │ │
25
+ │ │ task1_price_variance │ │
26
+ │ │ task2_duplicate_tax │ │
27
+ │ │ task3_compound_fraud │ │
28
+ │ └─────────────────────────┘ │
29
+ └─────────────────────────────────────────────────────────────┘
30
+ ```
31
+
32
+ ## Key Design Decisions
33
+
34
+ ### FastAPI + Gradio in same process
35
+ HF Spaces requires a single port (7860). Gradio is mounted on FastAPI using
36
+ `gr.mount_gradio_app()` so both the validator API and the interactive UI
37
+ share the same process and port.
38
+
39
+ ### Pydantic v2 for all models
40
+ Required by the OpenEnv spec. Every field is typed. No `Any` fields without
41
+ explicit documentation of why.
42
+
43
+ ### EpisodeData vs EnvironmentState
44
+ - **EpisodeData** is mutable internal state tracking what the agent has done
45
+ - **EnvironmentState** is the immutable snapshot returned to the agent
46
+ - Documents (PO, Invoice, GRN) are rebuilt from task factories each time,
47
+ ensuring they are never accidentally mutated
48
+
49
+ ### Separate task classes
50
+ Each task is a self-contained class with its own documents, simulators, and
51
+ grader. This makes it trivial to add new tasks — just implement BaseTask and
52
+ register in TASK_REGISTRY.
53
+
54
+ ### Deterministic simulation
55
+ No randomness in simulators or graders. Same seed + same actions = same scores.
56
+ The only randomness is in `action_space_sample()` for baseline agents.
documents/CHANGELOG.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ All changes to the Invoice Exception Handler environment are recorded here.
4
+
5
+ ---
6
+
7
+ ## [1.0.0] — 2025-01-20
8
+
9
+ ### Added
10
+ - Initial implementation of InvoiceExceptionEnv with full OpenEnv API
11
+ - Three tasks: task1_price_variance, task2_duplicate_tax, task3_compound_fraud
12
+ - Pydantic v2 typed models for all environment objects
13
+ - FastAPI HTTP endpoints for HF Spaces validation
14
+ - Gradio UI for interactive exploration
15
+ - inference.py using OpenAI client with [START][STEP][END] log format
16
+ - openenv.yaml spec file
17
+ - Dockerfile for HF Spaces deployment
18
+
19
+ ### Design decisions
20
+ - Used pure Python simulation (no external databases) for portability and determinism
21
+ - Compound fraud task has four signals to prevent simple greedy agents from scoring well
22
+ - Channel selection in Task 3 (phone vs email) tests policy knowledge, not just anomaly detection
23
+ - Grader uses sub-scores to allow partial credit for partial solutions
documents/PRD-001-product-requirements.md ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Product Requirements Document
2
+ ## Invoice Exception Handler — OpenEnv Agent Learning Environment
3
+
4
+ **Document ID:** PRD-001
5
+ **Version:** 1.0.0
6
+ **Status:** Final
7
+ **Author:** Mohammed Yusuf Ahmed
8
+ **Last Updated:** 2025-01-20
9
+ **Classification:** Internal / Hackathon Submission
10
+
11
+ ---
12
+
13
+ ## Table of Contents
14
+
15
+ 1. [Executive Summary](#1-executive-summary)
16
+ 2. [Problem Statement](#2-problem-statement)
17
+ 3. [Product Vision](#3-product-vision)
18
+ 4. [Stakeholders](#4-stakeholders)
19
+ 5. [Functional Requirements](#5-functional-requirements)
20
+ 6. [Non-Functional Requirements](#6-non-functional-requirements)
21
+ 7. [System Architecture](#7-system-architecture)
22
+ 8. [Task Specifications](#8-task-specifications)
23
+ 9. [Reward Design](#9-reward-design)
24
+ 10. [Evaluation Criteria](#10-evaluation-criteria)
25
+ 11. [API Contract](#11-api-contract)
26
+ 12. [File Structure](#12-file-structure)
27
+ 13. [Out of Scope](#13-out-of-scope)
28
+ 14. [Change Log](#14-change-log)
29
+
30
+ ---
31
+
32
+ ## 1. Executive Summary
33
+
34
+ The Invoice Exception Handler is a real-world agent learning environment built for the OpenEnv standard. It simulates the accounts payable (AP) exception handling workflow that every business on earth runs daily — the process of investigating flagged invoices before payment is approved.
35
+
36
+ The environment places an AI agent in the role of an AP analyst. The agent receives a document packet (Purchase Order, Invoice, Goods Receipt Note, Supplier Master), reads an exception flag, and must investigate the root cause, make a decision, route the case to the right team, and close it cleanly. Every action has realistic financial and compliance consequences.
37
+
38
+ The environment ships with three tasks of increasing difficulty — price variance (easy), duplicate with hidden tax error (medium), and compound fraud with four simultaneous signals (hard).
39
+
40
+ ---
41
+
42
+ ## 2. Problem Statement
43
+
44
+ ### 2.1 The Real-World Pain
45
+
46
+ Every company that buys goods or services from suppliers receives invoices. Typically 5–15% of all invoices have exceptions — discrepancies between what was ordered (PO), what was received (GRN), and what was invoiced. These exceptions are currently handled by accounts payable clerks who manually:
47
+
48
+ 1. Pull the original Purchase Order
49
+ 2. Compare it line by line against the invoice
50
+ 3. Check the Goods Receipt Note
51
+ 4. Run validation checks
52
+ 5. Query internal teams or the supplier
53
+ 6. Make a decision (approve / reject / hold / partial approve)
54
+ 7. Route the case and document everything
55
+
56
+ At a mid-size company this is 2–4 hours of analyst time per day. At enterprise scale it is entire departments. The cost to the AP automation market exceeds $3 billion annually.
57
+
58
+ ### 2.2 The AI Gap
59
+
60
+ No existing OpenEnv benchmark tests an agent's ability to:
61
+ - Reason across multiple documents simultaneously
62
+ - Apply business rules with thresholds and exceptions
63
+ - Detect fraud signals that require cross-referencing
64
+ - Make nuanced decisions (partial approve, hold, escalate)
65
+ - Know *not* to contact a supplier via a potentially compromised channel
66
+
67
+ This gap means agents trained on existing benchmarks cannot be evaluated or trained on one of the most common finance workflows in enterprise software.
68
+
69
+ ### 2.3 What This Environment Fixes
70
+
71
+ The Invoice Exception Handler provides:
72
+ - A clean, typed, deterministic simulation of AP exception handling
73
+ - Three tasks that test a progression of reasoning: threshold logic → duplicate detection → multi-signal fraud
74
+ - Shaped rewards that signal progress at every step, not just at episode end
75
+ - A fully deployable environment that conforms to the OpenEnv spec
76
+
77
+ ---
78
+
79
+ ## 3. Product Vision
80
+
81
+ > An agent that scores well in this environment is demonstrably better at AP exception handling than the average accounts payable clerk — and is ready to be deployed in real enterprise finance workflows.
82
+
83
+ The environment is designed so that:
84
+ - The reward signal is meaningful enough to actually train agents on, not just evaluate them
85
+ - The hard task (compound fraud) remains genuinely difficult for frontier models
86
+ - Every score between 0.0 and 1.0 reflects a real quality difference in agent behavior
87
+
88
+ ---
89
+
90
+ ## 4. Stakeholders
91
+
92
+ | Stakeholder | Role | Interest |
93
+ |---|---|---|
94
+ | Hackathon Judges (Meta, HF engineers) | Evaluators | Real-world utility, code quality, creativity |
95
+ | OpenEnv Automated Validator | Gatekeeper | Spec compliance, deployment health |
96
+ | AI Researchers | Primary users post-submission | Training and evaluating AP agents |
97
+ | Enterprise Software Companies | Secondary users | Evaluating models for AP automation products |
98
+
99
+ ---
100
+
101
+ ## 5. Functional Requirements
102
+
103
+ ### 5.1 Core Environment API
104
+
105
+ | Requirement | Priority | Detail |
106
+ |---|---|---|
107
+ | FR-001 | MUST | `env.reset(task_id)` returns a clean `EnvironmentState` |
108
+ | FR-002 | MUST | `env.step(action)` returns `StepResult(observation, reward, done, info)` |
109
+ | FR-003 | MUST | `env.state()` returns current state without advancing episode |
110
+ | FR-004 | MUST | `env.grade()` returns a score dict with overall score 0.0–1.0 |
111
+ | FR-005 | MUST | All models are typed Pydantic v2 with no untyped fields |
112
+ | FR-006 | MUST | `openenv.yaml` passes `openenv validate` |
113
+
114
+ ### 5.2 HTTP Endpoints (for HF Spaces validator)
115
+
116
+ | Requirement | Priority | Detail |
117
+ |---|---|---|
118
+ | FR-007 | MUST | `POST /reset` returns HTTP 200 with JSON observation |
119
+ | FR-008 | MUST | `POST /step` returns HTTP 200 with JSON StepResult |
120
+ | FR-009 | MUST | `GET /state` returns HTTP 200 with JSON EnvironmentState |
121
+ | FR-010 | MUST | `GET /health` returns HTTP 200 `{"status": "ok"}` |
122
+ | FR-011 | SHOULD | `GET /` returns HTML documentation page |
123
+
124
+ ### 5.3 Task Requirements
125
+
126
+ | Requirement | Priority | Detail |
127
+ |---|---|---|
128
+ | FR-012 | MUST | Minimum 3 tasks with distinct scenarios |
129
+ | FR-013 | MUST | Tasks range easy → medium → hard |
130
+ | FR-014 | MUST | Each task has a deterministic grader returning 0.0–1.0 |
131
+ | FR-015 | MUST | Graders have sub-scores (diagnosis, investigation, decision, routing, closure, efficiency) |
132
+ | FR-016 | MUST | Hard task must not be solvable by simple heuristics |
133
+
134
+ ### 5.4 Reward Function
135
+
136
+ | Requirement | Priority | Detail |
137
+ |---|---|---|
138
+ | FR-017 | MUST | Reward is shaped across the full trajectory |
139
+ | FR-018 | MUST | Dangerous actions (approving fraud) produce large negative rewards |
140
+ | FR-019 | MUST | Repeating already-completed actions penalised lightly |
141
+ | FR-020 | MUST | Exceeding step budget penalised (SLA concept) |
142
+ | FR-021 | SHOULD | Efficiency bonus for completing faster than optimal |
143
+
144
+ ### 5.5 Inference Script
145
+
146
+ | Requirement | Priority | Detail |
147
+ |---|---|---|
148
+ | FR-022 | MUST | Script named exactly `inference.py` in root directory |
149
+ | FR-023 | MUST | Uses OpenAI client (not Anthropic SDK) |
150
+ | FR-024 | MUST | Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from environment |
151
+ | FR-025 | MUST | Emits `[START]`, `[STEP]`, `[END]` lines to stdout exactly as spec |
152
+ | FR-026 | MUST | Completes all 3 tasks in under 20 minutes on 2 vCPU / 8 GB RAM |
153
+ | FR-027 | MUST | Produces reproducible scores with the same seed |
154
+
155
+ ### 5.6 Deployment
156
+
157
+ | Requirement | Priority | Detail |
158
+ |---|---|---|
159
+ | FR-028 | MUST | Dockerfile builds cleanly without internet access at run time |
160
+ | FR-029 | MUST | Container starts and serves on port 7860 |
161
+ | FR-030 | MUST | HF Spaces `POST /reset` returns 200 |
162
+ | FR-031 | MUST | README documents setup, action space, observation space, tasks, baseline scores |
163
+
164
+ ---
165
+
166
+ ## 6. Non-Functional Requirements
167
+
168
+ | ID | Category | Requirement |
169
+ |---|---|---|
170
+ | NFR-001 | Performance | `reset()` completes in < 100ms |
171
+ | NFR-002 | Performance | `step()` completes in < 50ms |
172
+ | NFR-003 | Performance | Full 3-task inference run completes in < 20 minutes |
173
+ | NFR-004 | Resource | Runs on 2 vCPU, 8 GB RAM — no GPU required |
174
+ | NFR-005 | Correctness | Grader output is deterministic — same actions = same score |
175
+ | NFR-006 | Correctness | Reward values are deterministic — no randomness in simulation |
176
+ | NFR-007 | Code quality | No bare `except:` blocks — all exceptions typed |
177
+ | NFR-008 | Code quality | All functions have docstrings |
178
+ | NFR-009 | Code quality | Type hints on all function signatures |
179
+ | NFR-010 | Portability | Zero OS-specific code — runs on Linux (Docker) |
180
+ | NFR-011 | Security | No hardcoded credentials anywhere in code |
181
+
182
+ ---
183
+
184
+ ## 7. System Architecture
185
+
186
+ ```
187
+ ┌─────────────────────────────────────────────────────────────┐
188
+ │ HF Space / Docker Container │
189
+ │ │
190
+ │ ┌──────────────┐ ┌──────────────────────────────────┐ │
191
+ │ │ Gradio UI │ │ FastAPI Server │ │
192
+ │ │ (port 7860) │ │ POST /reset GET /state │ │
193
+ │ │ │ │ POST /step GET /health │ │
194
+ │ └──────┬───────┘ └──────────────┬───────────────────┘ │
195
+ │ │ │ │
196
+ │ └──────────┬────────────────┘ │
197
+ │ │ │
198
+ │ ┌──────────▼──────────────┐ │
199
+ │ │ InvoiceExceptionEnv │ │
200
+ │ │ reset() step() state() │ │
201
+ │ │ grade() │ │
202
+ │ └──────────┬──────────────┘ │
203
+ │ │ │
204
+ │ ┌──────────▼──────────────┐ │
205
+ │ │ Task Registry │ │
206
+ │ │ task1_price_variance │ │
207
+ │ │ task2_duplicate_tax │ │
208
+ │ │ task3_compound_fraud │ │
209
+ │ └─────────────────────────┘ │
210
+ └─────────────────────────────────────────────────────────────┘
211
+
212
+ ┌─────────────────────────────────────────────────────────────┐
213
+ │ inference.py (agent) │
214
+ │ │
215
+ │ OpenAI Client → env.reset() → loop { │
216
+ │ action = LLM(observation_json) │
217
+ │ result = env.step(action) │
218
+ │ log [STEP] │
219
+ │ } → log [END] │
220
+ └─────────────────────────────────────────────────────────────┘
221
+ ```
222
+
223
+ ### 7.1 Data Flow
224
+
225
+ ```
226
+ Episode start
227
+
228
+
229
+ reset(task_id) ──► builds DocumentPacket + EpisodeData ──► EnvironmentState
230
+
231
+
232
+ step(action) ──► dispatch to task simulator ──► (reward, info)
233
+ │ │
234
+ ▼ ▼
235
+ EpisodeData updated ◄──────────────────── append to history
236
+
237
+
238
+ new EnvironmentState built ──► StepResult(obs, reward, done, info)
239
+
240
+
241
+ grade() ──► EpisodeData ──► grader logic ──► Dict[str, float]
242
+ ```
243
+
244
+ ---
245
+
246
+ ## 8. Task Specifications
247
+
248
+ ### 8.1 Task 1 — Price Variance Exception (Easy)
249
+
250
+ **Scenario:** Office stationery invoice arrives 3.08% above the PO amount. Company tolerance policy is ±2% for auto-approval. The supplier cites an email from the procurement team recording a verbal approval of a raw material price increase — an approval that was never formalised in the PO.
251
+
252
+ **What makes it easy:** Single root cause, all signals are benign (no fraud), the fix is straightforward (confirm with procurement, approve with PO amendment).
253
+
254
+ **Optimal path (10 steps):**
255
+ ```
256
+ run_check(po_match)
257
+ run_check(tolerance_rule) ← finds 3.08% > 2%
258
+ cross_check(unit_price, invoice, po) ← finds two mismatched lines
259
+ run_check(grn_match) ← confirms delivery complete
260
+ query_supplier(reason for increase) ← gets email confirmation
261
+ query_internal(procurement, confirm?) ← procurement confirms verbal approval
262
+ apply_rule(tolerance_exception_approval)
263
+ make_decision(approve, reason)
264
+ route_to(procurement, raise PO amendment)
265
+ close_case(summary)
266
+ ```
267
+
268
+ **Pitfalls:**
269
+ - Rejecting without querying supplier → wrong decision, score capped at ~0.35
270
+ - Approving without checking tolerance rule → policy violation, −0.15
271
+ - Disabling fraud checks that aren't needed → wasted steps
272
+
273
+ **Grader weights:**
274
+ | Sub-score | Max | Key signals |
275
+ |---|---|---|
276
+ | Diagnosis | 0.32 | tolerance_rule check, price mismatch found |
277
+ | Investigation | 0.30 | supplier queried, procurement confirmed |
278
+ | Decision | 0.18 | correct approve decision |
279
+ | Routing | 0.12 | PO amendment sent to procurement |
280
+ | Closure | 0.08 | case closed with summary |
281
+
282
+ ---
283
+
284
+ ### 8.2 Task 2 — Duplicate Invoice with Hidden Tax Error (Medium)
285
+
286
+ **Scenario:** Logistics supplier submits INV-2024-891. System flags it as a possible duplicate of INV-2024-819 (already paid). The invoice numbers differ by a digit transposition (8-9-1 vs 8-1-9). However: the original invoice applied 15% GST (wrong rate); the correct rate is 18%. The company underpaid ₹3,240 in tax on the original invoice — the supplier under-billed the GST. The new invoice has the correct rate. So it is simultaneously a duplicate AND a legitimate correction.
288
+
289
+ **What makes it medium:** The agent must not just detect the duplicate and reject — it must also detect the tax error in the *original* paid invoice and partially approve the correction delta (₹3,240). A simple "reject all duplicates" rule misses this and loses significant score.
290
+
291
+ **Optimal path (11 steps):**
292
+ ```
293
+ run_check(duplicate_detection) ← finds INV-2024-819
294
+ inspect_field(invoice, invoice_number) ← spots digit transposition
295
+ run_check(tax_calculation_verify) ← finds 15% vs 18% on original
296
+ cross_check(tax_amount, invoice, payment_history) ← confirms ₹3,240 delta
297
+ query_internal(finance, confirm underpayment?)
297
+ query_supplier(clarify relationship between invoices)
298
+ apply_rule(partial_approval)
299
+ apply_rule(credit_note_request)
300
+ make_decision(partial_approve, reason)
301
+ route_to(finance, process ₹3,240 correction)
302
+ close_case(summary)
303
+ ```
304
+
305
+ **Pitfalls:**
306
+ - Full rejection (catches duplicate, misses correction): score ~0.35
307
+ - Full approval (pays full duplicate): score −0.15
308
+ - Partial approve without credit note: score ~0.60
309
+
310
+ **Grader weights:**
311
+ | Sub-score | Max | Key signals |
312
+ |---|---|---|
313
+ | Diagnosis | 0.30 | duplicate found, tax error found |
314
+ | Investigation | 0.32 | finance queried, supplier queried, rules applied |
315
+ | Decision | 0.20 | partial_approve correct |
316
+ | Routing | 0.08 | finance routed for correction |
317
+ | Closure | 0.06 | case closed |
318
+
319
+ ---
320
+
321
+ ### 8.3 Task 3 — Compound Fraud Signals (Hard)
322
+
323
+ **Scenario:** IT equipment supplier submits an ₹8,47,500 invoice for 15 laptops. The system flags a bank account change. But there are four simultaneous fraud signals:
324
+
325
+ 1. **Bank account fraud:** Invoice has a different bank account than supplier master. The change request came from a lookalike email domain (techcore-solutions.com vs registered techcore-solutions.in) — a classic Business Email Compromise (BEC) attack.
326
+ 2. **GSTIN fraud:** The GST number on the invoice belongs to a *different company* — "TechCore Trading Pvt Ltd" in Delhi, not "TechCore Solutions."
327
+ 3. **Quantity mismatch:** Invoice claims 15 laptops. GRN confirms only 13 received. 2 are still in transit.
328
+ 4. **Price inflation:** ₹56,500/unit invoiced vs ₹52,000/unit on PO — 8.65% above PO with no approved revision.
329
+
330
+ **Critical trap:** If the agent queries the supplier via email, they contact the fraudster who will try to convince them to approve. The agent must use phone/registered channel.
331
+
332
+ **What makes it hard:** The agent must hold four independent fraud signals simultaneously, not be satisfied by finding one and stopping, detect the correct communication channel policy, and route to both legal and security (not just finance).
333
+
334
+ **Optimal path (17 steps):**
335
+ ```
336
+ inspect_field(invoice, bank_account) ← sees mismatch
337
+ cross_check(bank_account, invoice, supplier_master)
338
+ run_check(bank_account_verification) ← finds lookalike domain
339
+ run_check(email_domain_verification)
340
+ inspect_field(invoice, supplier_gstin)
341
+ run_check(gst_verification) ← finds GST belongs to different entity
342
+ cross_check(gstin, invoice, supplier_master)
343
+ inspect_field(grn, items_received)
344
+ run_check(grn_match) ← 13 vs 15
345
+ run_check(price_check) ← 8.65% above PO
346
+ query_supplier(confirm details, channel=phone) ← supplier confirms fraud
347
+ query_internal(security, investigate BEC)
348
+ apply_rule(fraud_hold)
349
+ make_decision(reject, all fraud signals documented)
350
+ route_to(legal, initiate supplier audit)
351
+ route_to(security, BEC investigation)
352
+ close_case(fraud report summary)
353
+ ```
354
+
355
+ **Critical pitfall — contacting via email:** −0.15 reward, and the agent receives the fraudster's response attempting to get the payment approved. Scoring penalises this heavily.
356
+
357
+ **Grader weights:**
358
+ | Sub-score | Max | Key signals |
359
+ |---|---|---|
360
+ | Diagnosis | 0.50 | bank fraud, GST fraud, quantity mismatch, domain lookalike, price inflation |
361
+ | Investigation | 0.20 | phone contact (not email), security queried, legal queried |
362
+ | Decision | 0.20 | reject with all signals documented |
363
+ | Routing | 0.20 | legal + security routed |
364
+ | Closure | 0.06 | case closed with fraud report |
+
+ > Note: these sub-score maxima sum to 1.16; the grader clips the combined score to the 0.0–1.0 range required by FR-014.
365
+
366
+ **Scoring thresholds:**
367
+ - Find 1 signal: ~0.20
368
+ - Find 2 signals: ~0.40
369
+ - Find 3 signals: ~0.60
370
+ - Find all 4 + correct routing: ~0.90+
371
+
372
+ ---
373
+
374
+ ## 9. Reward Design
375
+
376
+ ### 9.1 Philosophy
377
+
378
+ The reward function is designed around three principles:
379
+
380
+ **Principle 1: Every informative action gets signal.** Agents should learn that investigating is always better than guessing. Each relevant inspection, check, or query returns a positive reward proportional to how diagnostic that action is.
381
+
382
+ **Principle 2: Dangerous actions get crushed.** Approving a fraudulent invoice, disabling security controls, or contacting a supplier via a compromised channel are not mistakes — they are catastrophic errors. These must receive large negative rewards so agents learn to avoid them unconditionally.
383
+
384
+ **Principle 3: The grader is the ground truth, the shaped reward is the training signal.** The episode reward is shaped to help agents learn. The grader score at the end is what actually measures quality.
385
+
386
+ ### 9.2 Reward Table
387
+
388
+ | Action | Reward Range | Notes |
389
+ |---|---|---|
390
+ | `inspect_field` (relevant) | +0.01 to +0.14 | Higher for fields that reveal anomalies |
391
+ | `inspect_field` (irrelevant) | +0.01 | Still small positive — exploration is fine |
392
+ | `cross_check` (finds mismatch) | +0.12 to +0.15 | Diagnosis reward |
393
+ | `cross_check` (no mismatch) | +0.02 | Confirms a clean field |
394
+ | `run_check` (finds issue) | +0.08 to +0.18 | Higher for more diagnostic checks |
395
+ | `run_check` (clean) | +0.01 to +0.06 | Clean checks still confirm facts |
396
+ | `query_supplier` (phone) | +0.10 to +0.15 | Correct channel |
397
+ | `query_supplier` (email, fraud task) | −0.15 | Contacts fraudster |
398
+ | `query_internal` (key dept) | +0.04 to +0.12 | Higher for departments that add critical info |
399
+ | `apply_rule` (correct rule) | +0.08 to +0.12 | Applying the right policy pathway |
400
+ | `apply_rule` (wrong rule) | −0.05 to −0.10 | Misapplying policy |
401
+ | `make_decision` (correct) | +0.18 to +0.28 | Correct decision based on evidence |
402
+ | `make_decision` (wrong) | −0.10 to −0.40 | Severity scales with how wrong |
403
+ | `route_to` (correct team) | +0.06 to +0.14 | Right escalation path |
404
+ | `close_case` (complete) | +0.06 to +0.12 | Depends on decision quality |
405
+ | Repeat action | −0.02 to −0.05 | Light penalty, not catastrophic |
406
+ | SLA breach (exceed max steps) | −0.10 | One-time penalty at end |
407
+
408
+ ### 9.3 Episode Score vs Cumulative Reward
409
+
410
+ These are different numbers:
411
+ - **Cumulative reward** is the sum of step rewards. It is used as a training signal.
412
+ - **Episode score** (from `grade()`) is the holistic quality assessment. It is what the hackathon evaluates.
413
+
414
+ Agents should be optimised on the grade score, not the cumulative reward alone.
415
+
416
+ ---
417
+
418
+ ## 10. Evaluation Criteria
419
+
420
+ ### 10.1 Hackathon Scoring
421
+
422
+ | Criterion | Weight | What judges look for |
423
+ |---|---|---|
424
+ | Real-world utility | 30% | Would an enterprise actually use this? Does it model the task faithfully? |
425
+ | Task & grader quality | 25% | Clear objectives, accurate grading, genuine difficulty progression, frontier models challenged |
426
+ | Environment design | 20% | Clean state management, good action/observation spaces, shaped reward, sensible episode boundaries |
427
+ | Code quality & spec compliance | 15% | OpenEnv spec passes, Dockerfile works, baseline reproduces, typed models |
428
+ | Creativity & novelty | 10% | Novel domain, interesting mechanics, original reward design |
429
+
430
+ ### 10.2 Automated Gates (must all pass)
431
+
432
+ 1. HF Space deploys — `POST /reset` returns 200
433
+ 2. `openenv validate` passes
434
+ 3. `docker build` succeeds
435
+ 4. `python inference.py` runs without error, produces scores
436
+ 5. All 3 tasks enumerated, grader scores verified in [0.0, 1.0]
437
+
438
+ ### 10.3 Phase 2 — Agentic Evaluation
439
+
440
+ The hackathon will run a standard open LLM agent (e.g. Nemotron 3 Super) against the environment. The environment must:
441
+ - Not be trivially solvable by a greedy agent
442
+ - Produce score variance across tasks (not all the same)
443
+ - Penalise clearly suboptimal behaviour
444
+
445
+ ### 10.4 Disqualifiers
446
+
447
+ - Environment does not deploy or respond to `/reset`
448
+ - Graders that always return the same score regardless of actions
449
+ - `inference.py` not in root, or not using OpenAI client
450
+ - No baseline scores produced
451
+ - Plagiarised environment
452
+
453
+ ---
454
+
455
+ ## 11. API Contract
456
+
457
+ ### 11.1 Environment Python API
458
+
459
+ ```python
460
+ env = InvoiceExceptionEnv(seed=42)
461
+
462
+ # Reset — returns EnvironmentState
463
+ obs: EnvironmentState = env.reset("task1_price_variance")
464
+
465
+ # Step — returns StepResult
466
+ result: StepResult = env.step(Action.run_check("tolerance_rule"))
467
+ # result.observation → EnvironmentState
468
+ # result.reward → float
469
+ # result.done → bool
470
+ # result.info → dict
471
+
472
+ # State — non-destructive peek
473
+ obs: EnvironmentState = env.state()
474
+
475
+ # Grade — run grader on episode
476
+ scores: dict = env.grade()
477
+ # scores["score"] → 0.0–1.0 overall
478
+ # scores["diagnosis_score"] → float
479
+ # scores["decision_score"] → float
480
+ # ...
481
+ ```
482
+
483
+ ### 11.2 HTTP API
484
+
485
+ ```
486
+ POST /reset
487
+ Body: {"task_id": "task1_price_variance"} (optional — random if omitted)
488
+ Response: 200 EnvironmentState JSON
489
+
490
+ POST /step
491
+ Body: {"type": "run_check", "params": {"check_name": "tolerance_rule"}}
492
+ Response: 200 StepResult JSON
493
+
494
+ GET /state
495
+ Response: 200 EnvironmentState JSON
496
+
497
+ POST /grade
498
+ Response: 200 {"score": 0.85, "diagnosis_score": ...}
499
+
500
+ GET /tasks
501
+ Response: 200 ["task1_price_variance", "task2_duplicate_tax", "task3_compound_fraud"]
502
+
503
+ GET /health
504
+ Response: 200 {"status": "ok", "version": "1.0.0"}
505
+ ```
506
+
507
+ ### 11.3 Action Schema
508
+
509
+ ```json
510
+ {
511
+ "type": "run_check",
512
+ "params": {"check_name": "tolerance_rule"}
513
+ }
514
+
515
+ {
516
+ "type": "inspect_field",
517
+ "params": {"document": "invoice", "field": "bank_account"}
518
+ }
519
+
520
+ {
521
+ "type": "cross_check",
522
+ "params": {"field": "unit_price", "doc_a": "invoice", "doc_b": "po"}
523
+ }
524
+
525
+ {
526
+ "type": "query_supplier",
527
+ "params": {"question": "Why does your bank account differ?", "channel": "phone"}
528
+ }
529
+
530
+ {
531
+ "type": "query_internal",
532
+ "params": {"department": "procurement", "question": "Did you approve this price?"}
533
+ }
534
+
535
+ {
536
+ "type": "apply_rule",
537
+ "params": {"rule_id": "tolerance_exception_approval"}
538
+ }
539
+
540
+ {
541
+ "type": "make_decision",
542
+ "params": {"decision": "approve", "reason": "Verbal approval confirmed by procurement."}
543
+ }
544
+
545
+ {
546
+ "type": "route_to",
547
+ "params": {"team": "procurement", "notes": "Please raise PO amendment for the price variance."}
548
+ }
549
+
550
+ {
551
+ "type": "close_case",
552
+ "params": {"summary": "Invoice approved. PO amendment requested. Case closed."}
553
+ }
554
+ ```
555
+
556
+ ---
557
+
558
+ ## 12. File Structure
559
+
560
+ ```
561
+ invoice-exception-handler/
562
+
563
+ ├── README.md # Full setup + usage guide
564
+ ├── openenv.yaml # OpenEnv spec (must pass openenv validate)
565
+ ├── Dockerfile # Single-stage Python 3.11-slim build
566
+ ├── requirements.txt # Pinned dependencies
567
+ ├── inference.py # Competition inference script (MUST be here)
568
+ ├── app.py # Gradio + FastAPI entrypoint for HF Spaces
569
+
570
+ ├── env/
571
+ │ ├── __init__.py
572
+ │ ├── models.py # All Pydantic typed models
573
+ │ ├── environment.py # InvoiceExceptionEnv class
574
+ │ └── tasks.py # 3 task classes + graders + EpisodeData
575
+
576
+ └── documents/
577
+ ├── PRD-001-product-requirements.md # This document
578
+ ├── CHANGELOG.md # Every code change recorded
579
+ ├── ARCHITECTURE.md # System diagram + decisions
580
+ └── BASELINE-SCORES.md # Reproducible benchmark results
581
+ ```
582
+
583
+ ---
584
+
585
+ ## 13. Out of Scope
586
+
587
+ The following are explicitly not part of v1.0:
588
+
589
+ - Real database connectivity (the environment is fully simulated)
590
+ - Multi-agent scenarios (one agent per episode)
591
+ - Partial observability (agent sees all documents from the start)
592
+ - User interface for human play (nice-to-have but not required for submission)
593
+ - Real supplier APIs (simulation only)
594
+ - Currency other than INR (can be extended in v1.1)
595
+ - Tasks beyond 3 (can be extended)
596
+
597
+ ---
598
+
599
+ ## 14. Change Log
600
+
601
+ | Version | Date | Author | Change |
602
+ |---|---|---|---|
603
+ | 0.1.0 | 2025-01-18 | [Author] | Initial draft — problem definition and task sketches |
604
+ | 0.2.0 | 2025-01-19 | [Author] | Added reward design section, API contract, file structure |
605
+ | 1.0.0 | 2025-01-20 | [Author] | Final version — all sections complete, ready for implementation |
documents/PRD.md ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Product Requirements Document
2
+ ## Invoice Exception Handler — OpenEnv Agent Learning Environment
3
+
4
+ **Document ID:** PRD-001
5
+ **Version:** 1.0.0
6
+ **Status:** Final
7
+ **Author:** [Your Name]
8
+ **Last Updated:** 2025-01-20
9
+ **Classification:** Internal / Hackathon Submission
10
+
11
+ ---
12
+
13
+ ## Table of Contents
14
+
15
+ 1. [Executive Summary](#1-executive-summary)
16
+ 2. [Problem Statement](#2-problem-statement)
17
+ 3. [Product Vision](#3-product-vision)
18
+ 4. [Stakeholders](#4-stakeholders)
19
+ 5. [Functional Requirements](#5-functional-requirements)
20
+ 6. [Non-Functional Requirements](#6-non-functional-requirements)
21
+ 7. [System Architecture](#7-system-architecture)
22
+ 8. [Task Specifications](#8-task-specifications)
23
+ 9. [Reward Design](#9-reward-design)
24
+ 10. [Evaluation Criteria](#10-evaluation-criteria)
25
+ 11. [API Contract](#11-api-contract)
26
+ 12. [File Structure](#12-file-structure)
27
+ 13. [Out of Scope](#13-out-of-scope)
28
+ 14. [Change Log](#14-change-log)
29
+
30
+ ---
31
+
32
+ ## 1. Executive Summary
33
+
34
+ The Invoice Exception Handler is a real-world agent learning environment built for the OpenEnv standard. It simulates the accounts payable (AP) exception handling workflow that every business on earth runs daily — the process of investigating flagged invoices before payment is approved.
35
+
36
+ The environment places an AI agent in the role of an AP analyst. The agent receives a document packet (Purchase Order, Invoice, Goods Receipt Note, Supplier Master), reads an exception flag, and must investigate the root cause, make a decision, route the case to the right team, and close it cleanly. Every action has realistic financial and compliance consequences.
37
+
38
+ The environment ships with three tasks of increasing difficulty — price variance (easy), duplicate with hidden tax error (medium), and compound fraud with four simultaneous signals (hard).
39
+
40
+ ---
41
+
42
+ ## 2. Problem Statement
43
+
44
+ ### 2.1 The Real-World Pain
45
+
46
+ Every company that buys goods or services from suppliers receives invoices. Typically 5–15% of all invoices have exceptions — discrepancies between what was ordered (PO), what was received (GRN), and what was invoiced. These exceptions are currently handled by accounts payable clerks who manually:
47
+
48
+ 1. Pull the original Purchase Order
49
+ 2. Compare it line by line against the invoice
50
+ 3. Check the Goods Receipt Note
51
+ 4. Run validation checks
52
+ 5. Query internal teams or the supplier
53
+ 6. Make a decision (approve / reject / hold / partial approve)
54
+ 7. Route the case and document everything
55
+
56
+ At a mid-size company this is 2–4 hours of analyst time per day. At enterprise scale it is entire departments. This inefficiency drives an AP automation market that exceeds $3 billion annually.
57
+
58
+ ### 2.2 The AI Gap
59
+
60
+ No existing OpenEnv benchmark tests an agent's ability to:
61
+ - Reason across multiple documents simultaneously
62
+ - Apply business rules with thresholds and exceptions
63
+ - Detect fraud signals that require cross-referencing
64
+ - Make nuanced decisions (partial approve, hold, escalate)
65
+ - Know *not* to contact a supplier via a potentially compromised channel
66
+
67
+ This gap means agents trained on existing benchmarks cannot be evaluated or trained on one of the most common finance workflows in enterprise software.
68
+
69
+ ### 2.3 What This Environment Fixes
70
+
71
+ The Invoice Exception Handler provides:
72
+ - A clean, typed, deterministic simulation of AP exception handling
73
+ - Three tasks that test a progression of reasoning: threshold logic → duplicate detection → multi-signal fraud
74
+ - Shaped rewards that signal progress at every step, not just at episode end
75
+ - A fully deployable environment that conforms to the OpenEnv spec
76
+
77
+ ---
78
+
79
+ ## 3. Product Vision
80
+
81
+ > An agent that scores well in this environment is demonstrably better at AP exception handling than the average accounts payable clerk — and is ready to be deployed in real enterprise finance workflows.
82
+
83
+ The environment is designed so that:
84
+ - The reward signal is meaningful enough to actually train agents on, not just evaluate them
85
+ - The hard task (compound fraud) remains genuinely difficult for frontier models
86
+ - Every score between 0.0 and 1.0 reflects a real quality difference in agent behavior
87
+
88
+ ---
89
+
90
+ ## 4. Stakeholders
91
+
92
+ | Stakeholder | Role | Interest |
93
+ |---|---|---|
94
+ | Hackathon Judges (Meta, HF engineers) | Evaluators | Real-world utility, code quality, creativity |
95
+ | OpenEnv Automated Validator | Gatekeeper | Spec compliance, deployment health |
96
+ | AI Researchers | Primary users post-submission | Training and evaluating AP agents |
97
+ | Enterprise Software Companies | Secondary users | Evaluating models for AP automation products |
98
+
99
+ ---
100
+
101
+ ## 5. Functional Requirements
102
+
103
+ ### 5.1 Core Environment API
104
+
105
+ | Requirement | Priority | Detail |
106
+ |---|---|---|
107
+ | FR-001 | MUST | `env.reset(task_id)` returns a clean `EnvironmentState` |
108
+ | FR-002 | MUST | `env.step(action)` returns `StepResult(observation, reward, done, info)` |
109
+ | FR-003 | MUST | `env.state()` returns current state without advancing episode |
110
+ | FR-004 | MUST | `env.grade()` returns a score dict with overall score 0.0–1.0 |
111
+ | FR-005 | MUST | All models are typed Pydantic v2 with no untyped fields |
112
+ | FR-006 | MUST | `openenv.yaml` passes `openenv validate` |
113
+
114
+ ### 5.2 HTTP Endpoints (for HF Spaces validator)
115
+
116
+ | Requirement | Priority | Detail |
117
+ |---|---|---|
118
+ | FR-007 | MUST | `POST /reset` returns HTTP 200 with JSON observation |
119
+ | FR-008 | MUST | `POST /step` returns HTTP 200 with JSON StepResult |
120
+ | FR-009 | MUST | `GET /state` returns HTTP 200 with JSON EnvironmentState |
121
+ | FR-010 | MUST | `GET /health` returns HTTP 200 `{"status": "ok"}` |
122
+ | FR-011 | SHOULD | `GET /` returns HTML documentation page |
123
+
124
+ ### 5.3 Task Requirements
125
+
126
+ | Requirement | Priority | Detail |
127
+ |---|---|---|
128
+ | FR-012 | MUST | Minimum 3 tasks with distinct scenarios |
129
+ | FR-013 | MUST | Tasks range easy → medium → hard |
130
+ | FR-014 | MUST | Each task has a deterministic grader returning 0.0–1.0 |
131
+ | FR-015 | MUST | Graders have sub-scores (diagnosis, investigation, decision, routing, closure, efficiency) |
132
+ | FR-016 | MUST | Hard task must not be solvable by simple heuristics |
133
+
134
+ ### 5.4 Reward Function
135
+
136
+ | Requirement | Priority | Detail |
137
+ |---|---|---|
138
+ | FR-017 | MUST | Reward is shaped across the full trajectory |
139
+ | FR-018 | MUST | Dangerous actions (approving fraud) produce large negative rewards |
140
+ | FR-019 | MUST | Repeating already-completed actions penalised lightly |
141
+ | FR-020 | MUST | Exceeding step budget penalised (SLA concept) |
142
+ | FR-021 | SHOULD | Efficiency bonus for completing faster than optimal |
143
+
144
+ ### 5.5 Inference Script
145
+
146
+ | Requirement | Priority | Detail |
147
+ |---|---|---|
148
+ | FR-022 | MUST | Script named exactly `inference.py` in root directory |
149
+ | FR-023 | MUST | Uses OpenAI client (not Anthropic SDK) |
150
+ | FR-024 | MUST | Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from environment |
151
+ | FR-025 | MUST | Emits `[START]`, `[STEP]`, `[END]` lines to stdout exactly as spec |
152
+ | FR-026 | MUST | Completes all 3 tasks in under 20 minutes on 2 vCPU / 8 GB RAM |
153
+ | FR-027 | MUST | Produces reproducible scores with the same seed |
154
+
155
+ ### 5.6 Deployment
156
+
157
+ | Requirement | Priority | Detail |
158
+ |---|---|---|
159
+ | FR-028 | MUST | Dockerfile builds cleanly without internet access at run time |
160
+ | FR-029 | MUST | Container starts and serves on port 7860 |
161
+ | FR-030 | MUST | HF Spaces `POST /reset` returns 200 |
162
+ | FR-031 | MUST | README documents setup, action space, observation space, tasks, baseline scores |
163
+
164
+ ---
165
+
166
+ ## 6. Non-Functional Requirements
167
+
168
+ | ID | Category | Requirement |
169
+ |---|---|---|
170
+ | NFR-001 | Performance | `reset()` completes in < 100ms |
171
+ | NFR-002 | Performance | `step()` completes in < 50ms |
172
+ | NFR-003 | Performance | Full 3-task inference run completes in < 20 minutes |
173
+ | NFR-004 | Resource | Runs on 2 vCPU, 8 GB RAM — no GPU required |
174
+ | NFR-005 | Correctness | Grader output is deterministic — same actions = same score |
175
+ | NFR-006 | Correctness | Reward values are deterministic — no randomness in simulation |
176
+ | NFR-007 | Code quality | No bare `except:` blocks — all exceptions typed |
177
+ | NFR-008 | Code quality | All functions have docstrings |
178
+ | NFR-009 | Code quality | Type hints on all function signatures |
179
+ | NFR-010 | Portability | Zero OS-specific code — runs on Linux (Docker) |
180
+ | NFR-011 | Security | No hardcoded credentials anywhere in code |
181
+
182
+ ---
183
+
184
+ ## 7. System Architecture
185
+
186
+ ```
187
+ ┌─────────────────────────────────────────────────────────────┐
188
+ │ HF Space / Docker Container │
189
+ │ │
190
+ │ ┌──────────────┐ ┌──────────────────────────────────┐ │
191
+ │ │ Gradio UI │ │ FastAPI Server │ │
192
+ │ │ (port 7860) │ │ POST /reset GET /state │ │
193
+ │ │ │ │ POST /step GET /health │ │
194
+ │ └──────┬───────┘ └──────────────┬───────────────────┘ │
195
+ │ │ │ │
196
+ │ └──────────┬────────────────┘ │
197
+ │ │ │
198
+ │ ┌──────────▼──────────────┐ │
199
+ │ │ InvoiceExceptionEnv │ │
200
+ │ │ reset() step() state() │ │
201
+ │ │ grade() │ │
202
+ │ └──────────┬──────────────┘ │
203
+ │ │ │
204
+ │ ┌──────────▼──────────────┐ │
205
+ │ │ Task Registry │ │
206
+ │ │ task1_price_variance │ │
207
+ │ │ task2_duplicate_tax │ │
208
+ │ │ task3_compound_fraud │ │
209
+ │ └─────────────────────────┘ │
210
+ └─────────────────────────────────────────────────────────────┘
211
+
212
+ ┌─────────────────────────────────────────────────────────────┐
213
+ │ inference.py (agent) │
214
+ │ │
215
+ │ OpenAI Client → env.reset() → loop { │
216
+ │ action = LLM(observation_json) │
217
+ │ result = env.step(action) │
218
+ │ log [STEP] │
219
+ │ } → log [END] │
220
+ └─────────────────────────────────────────────────────────────┘
221
+ ```
222
+
223
+ ### 7.1 Data Flow
224
+
225
+ ```
226
+ Episode start
227
+
228
+
229
+ reset(task_id) ──► builds DocumentPacket + EpisodeData ──► EnvironmentState
230
+
231
+
232
+ step(action) ──► dispatch to task simulator ──► (reward, info)
233
+ │ │
234
+ ▼ ▼
235
+ EpisodeData updated ◄──────────────────── append to history
236
+
237
+
238
+ new EnvironmentState built ──► StepResult(obs, reward, done, info)
239
+
240
+
241
+ grade() ──► EpisodeData ──► grader logic ──► Dict[str, float]
242
+ ```
243
+
244
+ ---
245
+
246
+ ## 8. Task Specifications
247
+
248
+ ### 8.1 Task 1 — Price Variance Exception (Easy)
249
+
250
+ **Scenario:** Office stationery invoice arrives 3.08% above the PO amount. Company tolerance policy is ±2% for auto-approval. The supplier cites an email from the procurement team recording a verbal approval of a raw material price increase — an approval that was never formalised in the PO.
251
+
252
+ **What makes it easy:** Single root cause, all signals are benign (no fraud), the fix is straightforward (confirm with procurement, approve with PO amendment).
253
+
254
+ **Optimal path (10 steps):**
255
+ ```
256
+ run_check(po_match)
257
+ run_check(tolerance_rule) ← finds 3.08% > 2%
258
+ cross_check(unit_price, invoice, po) ← finds two mismatched lines
259
+ run_check(grn_match) ← confirms delivery complete
260
+ query_supplier(reason for increase) ← gets email confirmation
261
+ query_internal(procurement, confirm?) ← procurement confirms verbal approval
262
+ apply_rule(tolerance_exception_approval)
263
+ make_decision(approve, reason)
264
+ route_to(procurement, raise PO amendment)
265
+ close_case(summary)
266
+ ```
267
+
268
+ **Pitfalls:**
269
+ - Rejecting without querying supplier → wrong decision, score capped at ~0.35
270
+ - Approving without checking tolerance rule → policy violation, −0.15
271
+ - Disabling fraud checks that aren't needed → wasted steps
272
+
273
+ **Grader weights:**
274
+ | Sub-score | Max | Key signals |
275
+ |---|---|---|
276
+ | Diagnosis | 0.32 | tolerance_rule check, price mismatch found |
277
+ | Investigation | 0.30 | supplier queried, procurement confirmed |
278
+ | Decision | 0.18 | correct approve decision |
279
+ | Routing | 0.12 | PO amendment sent to procurement |
280
+ | Closure | 0.08 | case closed with summary |
281
+
282
+ ---
283
+
284
+ ### 8.2 Task 2 — Duplicate Invoice with Hidden Tax Error (Medium)
285
+
286
+ **Scenario:** Logistics supplier submits INV-2024-891. System flags it as a possible duplicate of INV-2024-819 (already paid). The invoice numbers differ by a digit transposition (8-9-1 vs 8-1-9). However: the original invoice applied 15% GST (wrong rate); the correct rate is 18%. The company overpaid ₹3,240 in tax on the original invoice. The new invoice has the correct rate. So it is simultaneously a duplicate AND a legitimate correction.
287
+
288
+ **What makes it medium:** The agent must not just detect the duplicate and reject — it must also detect the tax error in the *original* paid invoice and partially approve the correction delta (₹3,240). A simple "reject all duplicates" rule misses this and loses significant score.
289
+
290
+ **Optimal path (11 steps):**
291
+ ```
292
+ run_check(duplicate_detection) ← finds INV-2024-819
293
+ inspect_field(invoice, invoice_number) ← spots digit transposition
294
+ run_check(tax_calculation_verify) ← finds 15% vs 18% on original
295
+ cross_check(tax_amount, invoice, payment_history) ← confirms ₹3,240 delta
296
+ query_internal(finance, confirm overpayment?)
297
+ query_supplier(clarify relationship between invoices)
298
+ apply_rule(partial_approval)
299
+ apply_rule(credit_note_request)
300
+ make_decision(partial_approve, reason)
301
+ route_to(finance, process ₹3,240 correction)
302
+ close_case(summary)
303
+ ```
304
+
305
+ **Pitfalls:**
306
+ - Full rejection (catches duplicate, misses correction): score ~0.35
307
+ - Full approval (pays full duplicate): score −0.15
308
+ - Partial approve without credit note: score ~0.60
309
+
310
+ **Grader weights:**
311
+ | Sub-score | Max | Key signals |
312
+ |---|---|---|
313
+ | Diagnosis | 0.30 | duplicate found, tax error found |
314
+ | Investigation | 0.32 | finance queried, supplier queried, rules applied |
315
+ | Decision | 0.20 | partial_approve correct |
316
+ | Routing | 0.08 | finance routed for correction |
317
+ | Closure | 0.06 | case closed |
318
+
319
+ ---
320
+
321
+ ### 8.3 Task 3 — Compound Fraud Signals (Hard)
322
+
323
+ **Scenario:** IT equipment supplier submits an ₹8,47,500 invoice for 15 laptops. The system flags a bank account change. But there are four simultaneous fraud signals:
324
+
325
+ 1. **Bank account fraud:** Invoice has a different bank account than supplier master. The change request came from a lookalike email domain (techcore-solutions.com vs registered techcore-solutions.in) — a classic Business Email Compromise (BEC) attack.
326
+ 2. **GSTIN fraud:** The GST number on the invoice belongs to a *different company* — "TechCore Trading Pvt Ltd" in Delhi, not "TechCore Solutions."
327
+ 3. **Quantity mismatch:** Invoice claims 15 laptops. GRN confirms only 13 received. 2 are still in transit.
328
+ 4. **Price inflation:** ₹56,500/unit invoiced vs ₹52,000/unit on PO — 8.65% above PO with no approved revision.
329
+
330
+ **Critical trap:** If the agent queries the supplier via email, they contact the fraudster who will try to convince them to approve. The agent must use phone/registered channel.
331
+
332
+ **What makes it hard:** The agent must hold four independent fraud signals simultaneously, not be satisfied by finding one and stopping, detect the correct communication channel policy, and route to both legal and security (not just finance).
333
+
334
+ **Optimal path (17 steps):**
335
+ ```
336
+ inspect_field(invoice, bank_account) ← sees mismatch
337
+ cross_check(bank_account, invoice, supplier_master)
338
+ run_check(bank_account_verification) ← finds lookalike domain
339
+ run_check(email_domain_verification)
340
+ inspect_field(invoice, supplier_gstin)
341
+ run_check(gst_verification) ← finds GST belongs to different entity
342
+ cross_check(gstin, invoice, supplier_master)
343
+ inspect_field(grn, items_received)
344
+ run_check(grn_match) ← 13 vs 15
345
+ run_check(price_check) ← 8.65% above PO
346
+ query_supplier(confirm details, channel=phone) ← supplier confirms fraud
347
+ query_internal(security, investigate BEC)
348
+ apply_rule(fraud_hold)
349
+ make_decision(reject, all fraud signals documented)
350
+ route_to(legal, initiate supplier audit)
351
+ route_to(security, BEC investigation)
352
+ close_case(fraud report summary)
353
+ ```
354
+
355
+ **Critical pitfall — contacting via email:** −0.15 reward, and agent receives fraudster's response trying to get payment approved. Scoring penalises this heavily.
356
+
357
+ **Grader weights:**
358
+ | Sub-score | Max | Key signals |
359
+ |---|---|---|
360
+ | Diagnosis | 0.50 | bank fraud, GST fraud, quantity mismatch, domain lookalike, price inflation |
361
+ | Investigation | 0.20 | phone contact (not email), security queried, legal queried |
362
+ | Decision | 0.20 | reject with all signals documented |
363
+ | Routing | 0.20 | legal + security routed |
364
+ | Closure | 0.06 | case closed with fraud report |
+
+ *(Note: these sub-score maxima intentionally sum to more than 1.0 — the grader clamps the overall score to the [0.0, 1.0] range required by §10.2.)*
365
+
366
+ **Scoring thresholds:**
367
+ - Find 1 signal: ~0.20
368
+ - Find 2 signals: ~0.40
369
+ - Find 3 signals: ~0.60
370
+ - Find all 4 + correct routing: ~0.90+
371
+
372
+ ---
373
+
374
+ ## 9. Reward Design
375
+
376
+ ### 9.1 Philosophy
377
+
378
+ The reward function is designed around three principles:
379
+
380
+ **Principle 1: Every informative action gets signal.** Agents should learn that investigating is always better than guessing. Each relevant inspection, check, or query returns a positive reward proportional to how diagnostic that action is.
381
+
382
+ **Principle 2: Dangerous actions get crushed.** Approving a fraudulent invoice, disabling security controls, or contacting a supplier via a compromised channel are not mistakes — they are catastrophic errors. These must receive large negative rewards so agents learn to avoid them unconditionally.
383
+
384
+ **Principle 3: The grader is the ground truth, the shaped reward is the training signal.** The episode reward is shaped to help agents learn. The grader score at the end is what actually measures quality.
385
+
386
+ ### 9.2 Reward Table
387
+
388
+ | Action | Reward Range | Notes |
389
+ |---|---|---|
390
+ | `inspect_field` (relevant) | +0.01 to +0.14 | Higher for fields that reveal anomalies |
391
+ | `inspect_field` (irrelevant) | +0.01 | Still small positive — exploration is fine |
392
+ | `cross_check` (finds mismatch) | +0.12 to +0.15 | Diagnosis reward |
393
+ | `cross_check` (no mismatch) | +0.02 | Confirms a clean field |
394
+ | `run_check` (finds issue) | +0.08 to +0.18 | Higher for more diagnostic checks |
395
+ | `run_check` (clean) | +0.01 to +0.06 | Clean checks still confirm facts |
396
+ | `query_supplier` (phone) | +0.10 to +0.15 | Correct channel |
397
+ | `query_supplier` (email, fraud task) | −0.15 | Contacts fraudster |
398
+ | `query_internal` (key dept) | +0.04 to +0.12 | Higher for departments that add critical info |
399
+ | `apply_rule` (correct rule) | +0.08 to +0.12 | Applying the right policy pathway |
400
+ | `apply_rule` (wrong rule) | −0.05 to −0.10 | Misapplying policy |
401
+ | `make_decision` (correct) | +0.18 to +0.28 | Correct decision based on evidence |
402
+ | `make_decision` (wrong) | −0.10 to −0.40 | Severity scales with how wrong |
403
+ | `route_to` (correct team) | +0.06 to +0.14 | Right escalation path |
404
+ | `close_case` (complete) | +0.06 to +0.12 | Depends on decision quality |
405
+ | Repeat action | −0.02 to −0.05 | Light penalty, not catastrophic |
406
+ | SLA breach (exceed max steps) | −0.10 | One-time penalty at end |
407
+
408
+ ### 9.3 Episode Score vs Cumulative Reward
409
+
410
+ These are different numbers:
411
+ - **Cumulative reward** is the sum of step rewards. It is used as a training signal.
412
+ - **Episode score** (from `grade()`) is the holistic quality assessment. It is what the hackathon evaluates.
413
+
414
+ Agents should be optimised on the grade score, not the cumulative reward alone.
415
+
416
+ ---
417
+
418
+ ## 10. Evaluation Criteria
419
+
420
+ ### 10.1 Hackathon Scoring
421
+
422
+ | Criterion | Weight | What judges look for |
423
+ |---|---|---|
424
+ | Real-world utility | 30% | Would an enterprise actually use this? Does it model the task faithfully? |
425
+ | Task & grader quality | 25% | Clear objectives, accurate grading, genuine difficulty progression, frontier models challenged |
426
+ | Environment design | 20% | Clean state management, good action/observation spaces, shaped reward, sensible episode boundaries |
427
+ | Code quality & spec compliance | 15% | OpenEnv spec passes, Dockerfile works, baseline reproduces, typed models |
428
+ | Creativity & novelty | 10% | Novel domain, interesting mechanics, original reward design |
429
+
430
+ ### 10.2 Automated Gates (must all pass)
431
+
432
+ 1. HF Space deploys — `POST /reset` returns 200
433
+ 2. `openenv validate` passes
434
+ 3. `docker build` succeeds
435
+ 4. `python inference.py` runs without error, produces scores
436
+ 5. All 3 tasks enumerated, grader scores verified in [0.0, 1.0]
437
+
438
+ ### 10.3 Phase 2 — Agentic Evaluation
439
+
440
+ The hackathon will run a standard open LLM agent (e.g. Nemotron 3 Super) against the environment. The environment must:
441
+ - Not be trivially solvable by a greedy agent
442
+ - Produce score variance across tasks (not all the same)
443
+ - Penalise clearly suboptimal behaviour
444
+
445
+ ### 10.4 Disqualifiers
446
+
447
+ - Environment does not deploy or respond to `/reset`
448
+ - Graders that always return the same score regardless of actions
449
+ - `inference.py` not in root, or not using OpenAI client
450
+ - No baseline scores produced
451
+ - Plagiarised environment
452
+
453
+ ---
454
+
455
+ ## 11. API Contract
456
+
457
+ ### 11.1 Environment Python API
458
+
459
+ ```python
460
+ env = InvoiceExceptionEnv(seed=42)
461
+
462
+ # Reset — returns EnvironmentState
463
+ obs: EnvironmentState = env.reset("task1_price_variance")
464
+
465
+ # Step — returns StepResult
466
+ result: StepResult = env.step(Action.run_check("tolerance_rule"))
467
+ # result.observation → EnvironmentState
468
+ # result.reward → float
469
+ # result.done → bool
470
+ # result.info → dict
471
+
472
+ # State — non-destructive peek
473
+ obs: EnvironmentState = env.state()
474
+
475
+ # Grade — run grader on episode
476
+ scores: dict = env.grade()
477
+ # scores["score"] → 0.0–1.0 overall
478
+ # scores["diagnosis_score"] → float
479
+ # scores["decision_score"] → float
480
+ # ...
481
+ ```
482
+
483
+ ### 11.2 HTTP API
484
+
485
+ ```
486
+ POST /reset
487
+ Body: {"task_id": "task1_price_variance"} (optional — random if omitted)
488
+ Response: 200 EnvironmentState JSON
489
+
490
+ POST /step
491
+ Body: {"type": "run_check", "params": {"check_name": "tolerance_rule"}}
492
+ Response: 200 StepResult JSON
493
+
494
+ GET /state
495
+ Response: 200 EnvironmentState JSON
496
+
497
+ POST /grade
498
+ Response: 200 {"score": 0.85, "diagnosis_score": ...}
499
+
500
+ GET /tasks
501
+ Response: 200 ["task1_price_variance", "task2_duplicate_tax", "task3_compound_fraud"]
502
+
503
+ GET /health
504
+ Response: 200 {"status": "ok", "version": "1.0.0"}
505
+ ```
506
+
507
+ ### 11.3 Action Schema
508
+
509
+ ```json
510
+ {
511
+ "type": "run_check",
512
+ "params": {"check_name": "tolerance_rule"}
513
+ }
514
+
515
+ {
516
+ "type": "inspect_field",
517
+ "params": {"document": "invoice", "field": "bank_account"}
518
+ }
519
+
520
+ {
521
+ "type": "cross_check",
522
+ "params": {"field": "unit_price", "doc_a": "invoice", "doc_b": "po"}
523
+ }
524
+
525
+ {
526
+ "type": "query_supplier",
527
+ "params": {"question": "Why does your bank account differ?", "channel": "phone"}
528
+ }
529
+
530
+ {
531
+ "type": "query_internal",
532
+ "params": {"department": "procurement", "question": "Did you approve this price?"}
533
+ }
534
+
535
+ {
536
+ "type": "apply_rule",
537
+ "params": {"rule_id": "tolerance_exception_approval"}
538
+ }
539
+
540
+ {
541
+ "type": "make_decision",
542
+ "params": {"decision": "approve", "reason": "Verbal approval confirmed by procurement."}
543
+ }
544
+
545
+ {
546
+ "type": "route_to",
547
+ "params": {"team": "procurement", "notes": "Please raise PO amendment for the price variance."}
548
+ }
549
+
550
+ {
551
+ "type": "close_case",
552
+ "params": {"summary": "Invoice approved. PO amendment requested. Case closed."}
553
+ }
554
+ ```
555
+
556
+ ---
557
+
558
+ ## 12. File Structure
559
+
560
+ ```
561
+ invoice-exception-handler/
562
+
563
+ ├── README.md # Full setup + usage guide
564
+ ├── openenv.yaml # OpenEnv spec (must pass openenv validate)
565
+ ├── Dockerfile # Single-stage Python 3.11-slim build
566
+ ├── requirements.txt # Pinned dependencies
567
+ ├── inference.py # Competition inference script (MUST be here)
568
+ ├── app.py # Gradio + FastAPI entrypoint for HF Spaces
569
+
570
+ ├── env/
571
+ │ ├── __init__.py
572
+ │ ├── models.py # All Pydantic typed models
573
+ │ ├── environment.py # InvoiceExceptionEnv class
574
+ │ └── tasks.py # 3 task classes + graders + EpisodeData
575
+
576
+ └── documents/
577
+ ├── PRD-001-product-requirements.md # This document
578
+ ├── CHANGELOG.md # Every code change recorded
579
+ ├── ARCHITECTURE.md # System diagram + decisions
580
+ └── BASELINE-SCORES.md # Reproducible benchmark results
581
+ ```
582
+
583
+ ---
584
+
585
+ ## 13. Out of Scope
586
+
587
+ The following are explicitly not part of v1.0:
588
+
589
+ - Real database connectivity (the environment is fully simulated)
590
+ - Multi-agent scenarios (one agent per episode)
591
+ - Partial observability (agent sees all documents from the start)
592
+ - User interface for human play (nice-to-have but not required for submission)
593
+ - Real supplier APIs (simulation only)
594
+ - Currency other than INR (can be extended in v1.1)
595
+ - Tasks beyond 3 (can be extended)
596
+
597
+ ---
598
+
599
+ ## 14. Change Log
600
+
601
+ | Version | Date | Author | Change |
602
+ |---|---|---|---|
603
+ | 0.1.0 | 2025-01-18 | [Author] | Initial draft — problem definition and task sketches |
604
+ | 0.2.0 | 2025-01-19 | [Author] | Added reward design section, API contract, file structure |
605
+ | 1.0.0 | 2025-01-20 | [Author] | Final version — all sections complete, ready for implementation |
documents/README.md ADDED
@@ -0,0 +1,1610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Invoice Exception Handler — OpenEnv
2
+
3
+ > An AI agent learning environment that simulates accounts payable exception handling.
4
+ > The agent acts as an AP analyst: receives flagged invoices, investigates root causes,
5
+ > makes decisions, and closes cases. Built for the OpenEnv hackathon.
6
+
7
+ [![OpenEnv](https://img.shields.io/badge/OpenEnv-1.0.0-blue)](https://github.com/openenv/openenv)
8
+ [![Python](https://img.shields.io/badge/Python-3.11+-green)](https://python.org)
9
+ [![HF Space](https://img.shields.io/badge/HF%20Space-Live-yellow)](https://huggingface.co/spaces)
10
+
11
+ ---
12
+
13
+ ## For Agents Building This Project
14
+
15
+ This README is the single source of truth for building the entire project from scratch.
16
+ Read every section before writing any code. Do not skip sections. Do not guess.
17
+
18
+ ### Ground Rules
19
+
20
+ 1. **Write code like a human wrote it.** Use real variable names, not `x` or `tmp`. Add comments where
21
+ the logic is non-obvious. Leave one blank line between logical blocks inside functions. Use 4-space
22
+ indentation everywhere. Python files get a module docstring at the top explaining what the file does.
23
+
24
+ 2. **Create a new Git repo and push after every major milestone.** A milestone is: models done,
25
+ tasks done, environment done, API done, inference done, app done. Not after every file.
26
+
27
+ 3. **Record every change in `documents/CHANGELOG.md`.** Use the format in the changelog section below.
28
+ Before pushing, append to the changelog what changed and why.
29
+
30
+ 4. **If something in this README conflicts with the competition spec, the competition spec wins.**
31
+ The competition spec is in the document the user shared. Key points: `inference.py` must use the
32
+ OpenAI client. `[START]` `[STEP]` `[END]` format must be exact. `/reset` must return 200.
33
+
34
+ 5. **Test before pushing.** Run `python -c "from env import InvoiceExceptionEnv"` to check imports.
35
+ Run `python inference.py` with a dummy API key to check the log format. Run `docker build .` to
36
+ check the Dockerfile before claiming it works.
37
+
38
+ ---
39
+
40
+ ## What This Environment Does
41
+
42
+ Every company that buys goods or services deals with invoice exceptions — mismatches between
43
+ what was ordered, what arrived, and what was invoiced. Currently these are handled by accounts
44
+ payable analysts who manually compare documents and decide: approve, reject, hold, or escalate.
45
+
46
+ This environment puts an AI agent in that analyst's chair. The agent receives:
47
+ - A **Purchase Order** (what was agreed to)
48
+ - An **Invoice** (what the supplier is claiming)
49
+ - A **Goods Receipt Note** (what actually arrived)
50
+ - A **Supplier Master** (the verified supplier record)
51
+ - An **Exception Flag** (why the system flagged this invoice)
52
+
53
+ The agent investigates, runs checks, queries people, makes a decision, and closes the case.
54
+ Every action has realistic consequences including financial, compliance, and fraud implications.
55
+
56
+ ---
57
+
58
+ ## Repository Structure
59
+
60
+ Build the project with exactly this structure. Do not add extra directories. Do not rename files.
61
+
62
+ ```
63
+ invoice-exception-handler/
64
+
65
+ ├── README.md ← this file
66
+ ├── openenv.yaml ← OpenEnv spec, must pass openenv validate
67
+ ├── Dockerfile ← single-stage Python 3.11-slim
68
+ ├── requirements.txt ← pinned versions
69
+ ├── inference.py ← competition script, MUST be named this
70
+ ├── app.py ← Gradio + FastAPI, entry point for HF Spaces
71
+
72
+ ├── env/
73
+ │ ├── __init__.py ← exports InvoiceExceptionEnv, Action, ALL_TASKS
74
+ │ ├── models.py ← all Pydantic models (Action, EnvironmentState, etc.)
75
+ │ ├── environment.py ← InvoiceExceptionEnv class
76
+ │ └── tasks.py ← 3 task classes, EpisodeData, graders
77
+
78
+ └── documents/
79
+ ├── PRD-001-product-requirements.md
80
+ ├── CHANGELOG.md
81
+ ├── ARCHITECTURE.md
82
+ └── BASELINE-SCORES.md
83
+ ```
84
+
85
+ ---
86
+
87
+ ## Step-by-Step Build Order
88
+
89
+ Follow this order exactly. Do not jump ahead.
90
+
91
+ ```
92
+ Step 1 → Create the repo
93
+ Step 2 → Write requirements.txt
94
+ Step 3 → Write env/models.py
95
+ Step 4 → Write env/tasks.py
96
+ Step 5 → Write env/environment.py
97
+ Step 6 → Write env/__init__.py
98
+ Step 7 → Smoke test the environment (run a quick script)
99
+ Step 8 → Write openenv.yaml
100
+ Step 9 → Write inference.py
101
+ Step 10 → Write app.py
102
+ Step 11 → Write Dockerfile
103
+ Step 12 → Full end-to-end test
104
+ Step 13 → Write documents/
105
+ Step 14 → Push and verify
106
+ ```
107
+
108
+ ---
109
+
110
+ ## Step 1 — Create the Repo
111
+
112
+ ```bash
113
+ # Create the project directory
114
+ mkdir invoice-exception-handler
115
+ cd invoice-exception-handler
116
+
117
+ # Initialise git
118
+ git init
119
+ git checkout -b main
120
+
121
+ # Create the directory structure
122
+ mkdir -p env documents
123
+
124
+ # Create empty placeholder files so git tracks the structure
125
+ touch env/__init__.py
126
+ touch documents/.gitkeep
127
+
128
+ # First commit — skeleton only
129
+ git add .
130
+ git commit -m "init: project skeleton"
131
+
132
+ # Create the repo on GitHub/HF and push
133
+ # Replace with your actual remote
134
+ git remote add origin https://github.com/YOUR_USERNAME/invoice-exception-handler.git
135
+ git push -u origin main
136
+ ```
137
+
138
+ ---
139
+
140
+ ## Step 2 — requirements.txt
141
+
142
+ Pin every version. Do not use `>=` ranges — the validator builds in a clean environment and
143
+ range mismatches cause mysterious failures.
144
+
145
+ ```
146
+ pydantic==2.7.1
147
+ fastapi==0.111.0
148
+ uvicorn==0.29.0
149
+ gradio==4.36.1
150
+ openai==1.35.3
151
+ pyyaml==6.0.1
152
+ httpx==0.27.0
153
+ python-multipart==0.0.9
154
+ ```
155
+
156
+ ---
157
+
158
+ ## Step 3 — env/models.py
159
+
160
+ This file defines every typed object in the system. Write it before any other Python code.
161
+ Nothing is untyped. Every field has a type annotation.
162
+
163
+ ### What goes in models.py
164
+
165
+ **Enumerations:**
166
+ - `ActionType` — the 9 action types an agent can take (string enum)
167
+ - `DecisionType` — approve / reject / hold / partial_approve (string enum)
168
+ - `CaseStatus` — open / in_review / decided / routed / closed (string enum)
169
+
170
+ **Document models** (read-only context given to the agent):
171
+ - `LineItem` — one line on an invoice or PO (description, quantity, unit_price, total, tax_rate)
172
+ - `PurchaseOrder` — what was agreed to be purchased
173
+ - `Invoice` — what the supplier is claiming
174
+ - `GoodsReceiptNote` — what actually arrived at the warehouse
175
+ - `SupplierMaster` — the verified, registered supplier record
176
+ - `ExceptionFlag` — why the system flagged this invoice (flag_code, description, auto_hold)
177
+
178
+ **Action model:**
179
+ - `Action` — has a `type: ActionType` and `params: Dict[str, Any]`
180
+ - Add classmethod constructors for each action type so callers can do `Action.run_check("tolerance_rule")`
181
+
182
+ **Result models:**
183
+ - `InspectionResult` — what came back from inspect_field (document, field, value, note, timestamp)
184
+ - `CheckResult` — what came back from run_check or cross_check (check_name, passed, detail, timestamp)
185
+ - `QueryResult` — what came back from a query (target, question, response, channel, timestamp)
186
+
187
+ **State models:**
188
+ - `EnvironmentState` — the full observable state returned by reset() and step()
189
+ - `StepResult` — what step() returns: (observation, reward, done, info)
190
+
191
+ ### EnvironmentState fields
192
+
193
+ The EnvironmentState must include:
194
+ - `task_id: str`
195
+ - `step_number: int`
196
+ - `case_status: CaseStatus`
197
+ - All 5 documents (purchase_order, invoice, grn, supplier_master, exception_flag)
198
+ - Agent history: `inspections`, `checks_run`, `queries`, `rules_applied`
199
+ - Decision state: `decision`, `decision_reason`, `routed_to`, `case_closed`, `close_summary`
200
+ - Action hints: `available_actions`, `available_checks`, `available_rules`, `knowledge_base`
201
+ - `cumulative_reward: float`
202
+
203
+ ### Writing style for models.py
204
+
205
+ ```python
206
+ """
207
+ Typed models for the Invoice Exception Handler OpenEnv environment.
208
+
209
+ Every object the agent sees or produces is defined here as a Pydantic model.
210
+ This is the single source of truth for the data contract between the
211
+ environment simulation and the agent.
212
+ """
213
+ from __future__ import annotations
214
+
215
+ import time
216
+ from enum import Enum
217
+ from typing import Any, Dict, List, Optional
218
+
219
+ from pydantic import BaseModel, Field
220
+
221
+
222
+ class ActionType(str, Enum):
223
+ INSPECT_FIELD = "inspect_field"
224
+ CROSS_CHECK = "cross_check"
225
+ # ... etc
226
+ ```
227
+
228
+ Do not put business logic in models.py. Just data shapes.
229
+
230
+ ---
231
+
232
+ ## Step 4 — env/tasks.py
233
+
234
+ This is the biggest file. It defines what happens when the agent takes each action —
235
+ the simulated responses, the rewards, and the grading logic.
236
+
237
+ ### EpisodeData class
238
+
239
+ A plain Python class (not Pydantic) that tracks everything the agent has done in one episode.
240
+
241
+ ```python
242
+ class EpisodeData:
243
+ """Tracks the full history of one episode for grading and state building."""
244
+
245
+ def __init__(self):
246
+ self.inspections: List[InspectionResult] = []
247
+ self.checks: List[CheckResult] = []
248
+ self.queries: List[QueryResult] = []
249
+ self.rules_applied: List[str] = []
250
+ self.decision: Optional[str] = None
251
+ self.decision_reason: Optional[str] = None
252
+ self.routed_to: List[str] = []
253
+ self.closed: bool = False
254
+ self.close_summary: Optional[str] = None
255
+ self.step_count: int = 0
256
+ self.cumulative_reward: float = 0.0
257
+
258
+ def has_inspected(self, doc: str, field: str) -> bool:
259
+ """Check if we already looked at this field in this document."""
260
+ return any(i.document == doc and i.field == field for i in self.inspections)
261
+
262
+ def has_checked(self, name: str) -> bool:
263
+ """Check if this validation check has already been run."""
264
+ return any(c.check_name == name for c in self.checks)
265
+
266
+ def has_queried(self, target: str) -> bool:
267
+ """Check if we already queried this person or department."""
268
+ return any(q.target == target for q in self.queries)
269
+ ```
270
+
271
+ ### BaseTask class
272
+
273
+ Abstract base that all three tasks inherit from. Every method raises `NotImplementedError`.
274
+
275
+ ```python
276
+ class BaseTask:
277
+ task_id: str = "base"
278
+ max_steps: int = 20
279
+ difficulty: str = "easy"
280
+
281
+ # Document factories — return fresh objects each time (no shared state)
282
+ def get_purchase_order(self) -> PurchaseOrder: raise NotImplementedError
283
+ def get_invoice(self) -> Invoice: raise NotImplementedError
284
+ def get_grn(self) -> GoodsReceiptNote: raise NotImplementedError
285
+ def get_supplier_master(self) -> SupplierMaster: raise NotImplementedError
286
+ def get_exception_flag(self) -> ExceptionFlag: raise NotImplementedError
287
+
288
+ # Simulators — each returns (result_object, reward_delta)
289
+ def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]: ...
290
+ def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]: ...
291
+ def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]: ...
292
+ def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]: ...
293
+ def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]: ...
294
+ def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]: ...
295
+ def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float: ...
296
+ def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float: ...
297
+ def simulate_close(self, summary: str, ep: EpisodeData) -> float: ...
298
+ def grade(self, ep: EpisodeData) -> Dict[str, float]: ...
299
+
300
+ # These are properties, not methods
301
+ @property
302
+ def available_checks(self) -> List[str]: return []
303
+
304
+ @property
305
+ def available_rules(self) -> List[str]: return []
306
+
307
+ @property
308
+ def knowledge_base(self) -> List[str]: return []
309
+ ```
310
+
311
+ ### The Three Tasks
312
+
313
+ #### Task 1: PriceVarianceTask (task1_price_variance)
314
+
315
+ **The scenario:** An office stationery supplier sends an invoice that's 3.08% above the PO.
316
+ Company policy allows ±2% automatic approval. Above that needs manual exception approval.
317
+ The supplier did communicate the price increase but procurement never updated the PO.
318
+
319
+ **task_id:** `"task1_price_variance"`
320
+ **max_steps:** `18`
321
+ **difficulty:** `"easy"`
322
+
323
+ **The documents:**
324
+
325
+ PO (PO-2024-1041): 3 stationery line items totalling ₹50,000
326
+ - A4 Paper 100 reams @ ₹220 = ₹22,000
327
+ - Ballpoint Pens 20 boxes @ ₹450 = ₹9,000
328
+ - Staplers 10 units @ ₹1,900 = ₹19,000
329
+
330
+ Invoice (INV-ON-8821): Same items, same quantities, but 2 items have higher unit prices
331
+ - A4 Paper @ ₹231 (+₹11, +5.0%)
332
+ - Ballpoint Pens @ ₹472 (+₹22, +4.9%)
333
+ - Staplers unchanged @ ₹1,900
334
+ - Subtotal: ₹51,540 (+₹1,540, +3.08%)
335
+ - 18% GST applied correctly: ₹9,277.20
336
+ - Total: ₹60,817.20
337
+
338
+ GRN (GRN-2024-0892): All items fully received, no pending, no rejected.
339
+
340
+ Supplier Master (SUP-0441 — OfficeNeed Supplies): Bank account and GSTIN both match invoice exactly. No fraud signals.
341
+
342
+ Exception Flag: `PRICE_MISMATCH` — "Invoice total ₹51,540 exceeds PO ₹50,000 by ₹1,540 (3.08%). Above auto-approval threshold."
343
+
344
+ **Knowledge base entries:**
345
+ - POL-001: Price variance ≤±2% may be auto-approved. Above 2% requires exception approval.
346
+ - POL-002: Exception approval requires confirmation from originating department.
347
+ - POL-003: Any approved invoice with a price change must be followed by a PO amendment request.
348
+ - POL-004: Bank account on invoice must match supplier master.
349
+
350
+ **Simulator logic:**
351
+
352
+ `simulate_inspect`: Return meaningful values for invoice line_items (+0.10), invoice total_amount (+0.08), po line_items (+0.06), grn items_received (+0.05). Return +0.01 for unknown fields.
353
+
354
+ `simulate_cross_check`: The key cross-checks are:
355
+ - `(unit_price, invoice, po)` → finds Paper and Pen mismatch, reward +0.12
356
+ - `(total_amount, invoice, po)` → confirms 3.08% variance, reward +0.10
357
+ - `(bank_account, invoice, supplier_master)` → match (no fraud), reward +0.03
358
+ - `(gstin, invoice, supplier_master)` → match, reward +0.02
359
+ - `(quantity, invoice, grn)` → match (full delivery), reward +0.04
360
+
361
+ `simulate_run_check`:
362
+ - `"tolerance_rule"` → 3.08% > 2%, FAILS, reward +0.14 (most important check)
363
+ - `"grn_match"` → PASSES (all received), reward +0.06
364
+ - `"duplicate_detection"` → PASSES (not a dup), reward +0.02
365
+ - `"bank_account_verification"` → PASSES, reward +0.02
366
+ - `"gst_verification"` → PASSES, reward +0.02
367
+ - `"po_match"` → FAILS on price, reward +0.08
368
+
369
+ `simulate_query_supplier`: Returns email from supplier explaining raw material price increase communicated to Arjun Mehta at procurement on Feb 20. Reward +0.10.
370
+
371
+ `simulate_query_internal`:
372
+ - `"procurement"` → Arjun Mehta confirms verbal approval, says he'll raise PO amendment. Reward +0.12.
373
+ - Others → generic responses, reward +0.03.
374
+
375
+ `simulate_apply_rule`:
376
+ - `"tolerance_2pct_auto_approve"` → BLOCKED (3.08% > 2%), reward −0.05
377
+ - `"tolerance_exception_approval"` → APPLIED, reward +0.10
378
+ - `"rejection_with_reason"` → APPLIED but wrong, reward −0.08
379
+ - `"partial_approval"` → not applicable here, reward −0.05
380
+
381
+ `simulate_make_decision`:
382
+ - `"approve"` with tolerance check + procurement query: reward +0.25
383
+ - `"approve"` with tolerance check only: reward +0.18
384
+ - `"approve"` with nothing checked: reward +0.05 (bad approval, should have verified)
385
+ - `"reject"`: reward −0.10 (wrong decision, delay supplier)
386
+ - `"hold"`: reward +0.08
387
+
388
+ `simulate_route_to`:
389
+ - `"procurement"` → reward +0.12 (correct — PO amendment needed)
390
+ - `"finance"` → reward +0.03
391
+ - `"legal"` → reward −0.05 (overkill for a price variance)
392
+
393
+ `simulate_close`: reward +0.12 if approved + tolerance checked + procurement routed; +0.06 if only some of those conditions hold; otherwise 0.
394
+
395
+ **Grader (`grade` method):**
396
+ ```python
397
+ def grade(self, ep: EpisodeData) -> Dict[str, float]:
398
+ checks_run = {c.check_name for c in ep.checks}
399
+ queries_to = {q.target for q in ep.queries}
400
+
401
+ # Did the agent correctly diagnose?
402
+ d = 0.0
403
+ if any("unit_price" in c.check_name or "total" in c.check_name
404
+ for c in ep.checks):
405
+ d += 0.12
406
+ if "tolerance_rule" in checks_run:
407
+ d += 0.14
408
+ if "grn_match" in checks_run:
409
+ d += 0.06
410
+
411
+ # Did the agent investigate properly?
412
+ i = 0.0
413
+ if "supplier" in queries_to:
414
+ i += 0.10
415
+ if "procurement" in queries_to:
416
+ i += 0.12
417
+ if "tolerance_exception_approval" in ep.rules_applied:
418
+ i += 0.08
419
+
420
+ # Correct decision?
421
+ dec = 0.0
422
+ if ep.decision == "approve": dec += 0.18
423
+ elif ep.decision == "hold": dec += 0.06
424
+ elif ep.decision == "reject": dec -= 0.10
425
+
426
+ # Correct routing?
427
+ route = 0.12 if "procurement" in ep.routed_to else 0.0
428
+
429
+ # Closed cleanly?
430
+ closure = 0.08 if ep.closed else 0.0
431
+
432
+ # Efficiency bonus — penalise extra steps
433
+ eff = max(0.0, 0.06 - 0.004 * max(0, ep.step_count - 9))
434
+
435
+ total = d + i + dec + route + closure + eff
436
+ return {
437
+ "score": round(max(0.0, min(1.0, total)), 4),
438
+ "diagnosis_score": round(d, 4),
439
+ "investigation_score": round(i, 4),
440
+ "decision_score": round(dec, 4),
441
+ "routing_score": round(route, 4),
442
+ "closure_score": round(closure, 4),
443
+ "efficiency_score": round(eff, 4),
444
+ }
445
+ ```
446
+
447
+ ---
448
+
449
+ #### Task 2: DuplicateTaxErrorTask (task2_duplicate_tax)
450
+
451
+ **The scenario:** Logistics supplier submits INV-2024-891 for transport services. System flags
452
+ it as a possible duplicate. Turns out it IS a duplicate of INV-2024-819 — the numbers differ
453
+ by digit transposition (891 vs 819). That original invoice was already paid. BUT: the original
454
+ invoice applied 15% GST when the correct rate is 18%. The company overpaid ₹3,240 in tax.
455
+ The new invoice has the correct rate. So it's both a duplicate AND a legitimate correction.
456
+
457
+ **task_id:** `"task2_duplicate_tax"`
458
+ **max_steps:** `20`
459
+ **difficulty:** `"medium"`
460
+
461
+ **The documents:**
462
+
463
+ PO (PO-2024-0778): Logistics services
464
+ - Mumbai-Pune Transport 20 trips @ ₹4,500 = ₹90,000
465
+ - Warehousing charges Feb 2024 @ ₹18,000 = ₹18,000
466
+ - Total: ₹1,08,000, Net-15 terms
467
+
468
+ Invoice (INV-2024-891): Same services, same amounts — correct on the face of it
469
+ - Subtotal: ₹1,08,000
470
+ - GST 18%: ₹19,440 ← this is CORRECT
471
+ - Total: ₹1,27,440
472
+
473
+ GRN (GRN-2024-0740): Services confirmed complete (transport + warehousing).
474
+
475
+ Supplier Master (SUP-0229 — FastMove Logistics): Bank and GSTIN match invoice. No fraud signals.
476
+
477
+ Exception Flag: `POSSIBLE_DUPLICATE` — "Invoice INV-2024-891 closely matches previously processed invoice."
478
+
479
+ **Hidden state (not in documents, revealed by checks):**
480
+ - INV-2024-819 was paid 12 days ago for ₹1,24,200
481
+ - INV-2024-819 applied 15% GST = ₹16,200 (wrong rate)
482
+ - Correct 18% GST = ₹19,440
483
+ - Company overpaid: ₹3,240
484
+
485
+ **Key checks and what they reveal:**
486
+
487
+ `run_check("duplicate_detection")` → FAILS → finds INV-2024-819 paid 12 days ago, reward +0.18
488
+
489
+ `run_check("tax_calculation_verify")` → FAILS → discovers the 15% error on original, reveals ₹3,240 delta, reward +0.16
490
+
491
+ `cross_check(invoice_number, invoice, payment_history)` → finds digit transposition, reward +0.15
492
+
493
+ `cross_check(tax_amount, invoice, payment_history)` → confirms ₹3,240 delta, reward +0.14
494
+
495
+ `query_internal("finance")` → confirms overpayment on original, reward +0.12
496
+
497
+ `query_supplier` → supplier confirms they know and wants partial approval for the delta, reward +0.10
498
+
499
+ `apply_rule("partial_approval")` → correct pathway, reward +0.12
500
+
501
+ `apply_rule("credit_note_request")` → supplier must issue credit note for the balance, reward +0.10
502
+
503
+ **Decision logic:**
504
+
505
+ `simulate_make_decision`:
506
+ - `"partial_approve"` with dup + tax found: reward +0.28 ← optimal
507
+ - `"partial_approve"` with dup only: reward +0.14 ← incomplete
508
+ - `"reject"` with dup found: reward +0.08 ← catches dup, misses correction
509
+ - `"approve"` (pays full duplicate): reward −0.15 ← bad
510
+
511
+ **Grader weights:**
512
+ - diagnosis_score: up to 0.30 (dup found +0.16, tax error found +0.14)
513
+ - investigation_score: up to 0.32 (finance queried, supplier queried, rules applied)
514
+ - decision_score: up to 0.20 (partial_approve = 0.20, reject = 0.05, approve = −0.15)
515
+ - routing_score: up to 0.08
516
+ - closure_score: up to 0.06
517
+
518
+ ---
519
+
520
+ #### Task 3: CompoundFraudTask (task3_compound_fraud)
521
+
522
+ **The scenario:** IT supplier submits ₹8,47,500 invoice for 15 laptops. System flags a bank
523
+ account change. But there are FOUR simultaneous fraud signals, all of which the agent must find.
524
+
525
+ **task_id:** `"task3_compound_fraud"`
526
+ **max_steps:** `25`
527
+ **difficulty:** `"hard"`
528
+
529
+ **The four signals:**
530
+
531
+ 1. **Bank account fraud (Signal 1):** Invoice has a different bank account than the supplier
532
+ master. The change request came from `techcore-solutions.com`. The registered domain is
533
+ `techcore-solutions.in`. Classic Business Email Compromise (BEC) attack.
534
+
535
+ 2. **GSTIN fraud (Signal 2):** The GST number on the invoice (`07AABCT9999X1Z8`) belongs to
536
+ "TechCore Trading Pvt Ltd" — a completely different entity in Delhi. Supplier master shows
537
+ `07AABCT1234Y1Z5` for "TechCore Solutions."
538
+
539
+ 3. **Quantity mismatch (Signal 3):** Invoice claims 15 laptops. GRN shows only 13 received.
540
+ 2 units are still marked as pending.
541
+
542
+ 4. **Price inflation (Signal 4):** ₹56,500/unit on invoice vs ₹52,000/unit on PO. That's
543
+ 8.65% above the agreed price. No price revision was ever approved.
544
+
545
+ **Bonus signals (smaller, still notable):**
546
+ - Invoice is dated a Sunday (2024-03-10) — unusual for B2B
547
+ - PO was raised Friday March 8 — 2-day turnaround is suspiciously fast for IT equipment
548
+
549
+ **The critical trap — channel selection:**
550
+
551
+ `simulate_query_supplier(question, channel="email")` →
552
+ Returns fraudster's response urging payment to the new account. Reward: **−0.15**.
553
+
554
+ `simulate_query_supplier(question, channel="phone")` →
555
+ The real TechCore Solutions confirms they sent no bank change request. Confirms fraud. Reward: **+0.15**.
556
+
557
+ This tests whether the agent follows POL-009 ("bank account change must be verified via
558
+ registered phone number — NEVER via email") which is in the knowledge base.
559
+
560
+ **Available checks and rewards:**
561
+
562
+ ```python
563
+ "bank_account_verification" → FAILS, finds lookalike domain, reward +0.18
564
+ "gst_verification" → FAILS, GST belongs to different entity, reward +0.18
565
+ "grn_match" → FAILS, 13 vs 15 received, reward +0.14
566
+ "email_domain_verification" → FAILS, lookalike domain confirmed, reward +0.16
567
+ "invoice_date_validation" → FAILS, Sunday flag, reward +0.08
568
+ "quantity_check" → FAILS, quantity inflated, reward +0.12
569
+ "price_check" → FAILS, 8.65% above PO, reward +0.10
570
+ "duplicate_detection" → PASSES (not a dup), reward +0.02
571
+ "po_match" → FAILS (GST + qty + price all wrong), reward +0.08
572
+ ```
573
+
574
+ **Decision logic:**
575
+
576
+ `simulate_make_decision`:
577
+ - `"reject"` → reward = 0.10 + 0.05 × (number of signals found) → max ~0.30
578
+ - `"approve"` → reward −0.40 (catastrophic — approved fraud)
579
+ - `"partial_approve"` → reward −0.20 (you can't partially approve fraud)
580
+ - `"hold"` → reward = 0.08 + 0.03 × signals found → acceptable but not optimal
581
+
582
+ **Route logic:**
583
+
584
+ ```python
585
+ "legal" → reward +0.14 # must escalate to legal
586
+ "security" → reward +0.12 # BEC attack needs security investigation
587
+ "finance" → reward +0.08 # finance needs to block payment
588
+ "procurement" → reward +0.06
589
+ ```
590
+
591
+ **Grader — the signal detection scoring:**
592
+
593
+ ```python
594
+ def grade(self, ep: EpisodeData) -> Dict[str, float]:
595
+ checked = {c.check_name for c in ep.checks}
596
+
597
+ bank_found = "bank_account_verification" in checked
598
+ gst_found = "gst_verification" in checked
599
+ qty_found = "grn_match" in checked
600
+ domain_found = "email_domain_verification" in checked
601
+ price_found = "price_check" in checked
602
+
603
+ # Diagnosis — finding all signals is the whole point
604
+ d = (0.12 if bank_found else 0) + (0.12 if gst_found else 0) \
605
+ + (0.10 if qty_found else 0) + (0.10 if domain_found else 0) \
606
+ + (0.06 if price_found else 0)
607
+
608
+ # Investigation — reward for using phone not email
609
+ i = 0.0
610
+ for q in ep.queries:
611
+ if q.target == "supplier" and q.channel not in ("email", "mail"):
612
+ i += 0.10 # correct channel
613
+ elif q.target == "supplier" and q.channel in ("email", "mail"):
614
+ i -= 0.15 # contacting fraudster
615
+ if "legal" in {q.target for q in ep.queries}: i += 0.06
616
+ if "security" in {q.target for q in ep.queries}: i += 0.06
617
+
618
+ # Decision
619
+ signals = sum([bank_found, gst_found, qty_found, domain_found])
620
+ dec = 0.0
621
+ if ep.decision == "reject":
622
+ dec = 0.08 + 0.03 * signals
623
+ elif ep.decision == "approve":
624
+ dec = -0.35
625
+ elif ep.decision == "partial_approve":
626
+ dec = -0.15
627
+ elif ep.decision == "hold":
628
+ dec = 0.06
629
+
630
+ # Routing
631
+ routes = set(ep.routed_to)
632
+ route = (0.10 if "legal" in routes else 0) \
633
+ + (0.06 if "security" in routes else 0) \
634
+ + (0.04 if "finance" in routes else 0)
635
+
636
+ closure = 0.06 if (ep.closed and ep.decision == "reject") else 0.0
637
+ eff = max(0.0, 0.04 - 0.002 * max(0, ep.step_count - 12))
638
+
639
+ total = d + i + dec + route + closure + eff
640
+ return {
641
+ "score": round(max(0.0, min(1.0, total)), 4),
642
+ "signals_found": sum([bank_found, gst_found, qty_found, domain_found, price_found]),
643
+ "diagnosis_score": round(d, 4),
644
+ "investigation_score": round(i, 4),
645
+ "decision_score": round(dec, 4),
646
+ "routing_score": round(route, 4),
647
+ "closure_score": round(closure, 4),
648
+ "efficiency_score": round(eff, 4),
649
+ }
650
+ ```
651
+
652
+ ### Task Registry
653
+
654
+ At the bottom of tasks.py:
655
+
656
+ ```python
657
+ TASK_REGISTRY: Dict[str, type] = {
658
+ "task1_price_variance": PriceVarianceTask,
659
+ "task2_duplicate_tax": DuplicateTaxErrorTask,
660
+ "task3_compound_fraud": CompoundFraudTask,
661
+ }
662
+
663
+ ALL_TASKS = list(TASK_REGISTRY.keys())
664
+
665
+ def make_task(task_id: str) -> BaseTask:
666
+ cls = TASK_REGISTRY.get(task_id)
667
+ if cls is None:
668
+ raise ValueError(f"Unknown task '{task_id}'. Available: {ALL_TASKS}")
669
+ return cls()
670
+ ```
671
+
672
+ ---
673
+
674
+ ## Step 5 — env/environment.py
675
+
676
+ This is the `InvoiceExceptionEnv` class. It is the only thing external code needs to import.
677
+
678
+ ```python
679
+ class InvoiceExceptionEnv:
680
+ """
681
+ OpenEnv-compatible Invoice Exception Handler environment.
682
+
683
+ Usage:
684
+ env = InvoiceExceptionEnv(seed=42)
685
+ obs = env.reset("task1_price_variance")
686
+ result = env.step(Action.run_check("tolerance_rule"))
687
+ scores = env.grade()
688
+ """
689
+ ```
690
+
691
+ ### Constructor
692
+
693
+ Takes an optional `seed: Optional[int] = None` for reproducibility.
694
+ Initialises `self._rng = random.Random(seed)`.
695
+ Initialises `self._task`, `self._ep`, `self._state`, `self._done` all to None/False.
696
+
697
+ ### reset(task_id)
698
+
699
+ ```python
700
+ def reset(self, task_id: Optional[str] = None) -> EnvironmentState:
701
+ """
702
+ Start a new episode. If task_id is None, picks one at random.
703
+ Returns the initial EnvironmentState showing all documents and available actions.
704
+ """
705
+ ```
706
+
707
+ 1. Pick task (random if None)
708
+ 2. Create `EpisodeData()`
709
+ 3. Set `self._done = False`
710
+ 4. Call `self._build_state()` and store result
711
+ 5. Return the state
712
+
713
+ ### step(action)
714
+
715
+ ```python
716
+ def step(self, action: Union[Action, Dict[str, Any]]) -> StepResult:
717
+ """
718
+ Execute one action. Returns observation, reward, done flag, and info dict.
719
+ Raises RuntimeError if called before reset() or after the episode is done.
720
+ """
721
+ ```
722
+
723
+ 1. Validate we're in an active episode
724
+ 2. Convert dict to Action if needed
725
+ 3. Call `self._dispatch(action)` → gets (reward, info)
726
+ 4. Increment step count
727
+ 5. Check SLA (step count vs max_steps)
728
+ 6. Check done condition (closed or SLA breach)
729
+ 7. Rebuild state
730
+ 8. Return StepResult
731
+
732
+ ### state()
733
+
734
+ Non-destructive. Just returns `self._state`. Raises RuntimeError if not initialised.
735
+
736
+ ### grade()
737
+
738
+ Calls `self._task.grade(self._ep)` and returns the dict.
739
+
740
+ ### _dispatch(action)
741
+
742
+ The routing function. A single if/elif chain for each ActionType.
743
+
744
+ For each action:
745
+ 1. Call the appropriate task simulator
746
+ 2. Update EpisodeData
747
+ 3. Return (reward, info dict)
748
+
749
+ Handle repeated actions (inspect same field twice, check same thing twice) with a small −0.02 to −0.05 penalty and return early.
750
+
751
+ ### _build_state()
752
+
753
+ Constructs an `EnvironmentState` from the current `_task` and `_ep`. Called after every step.
754
+ Also determines the current `CaseStatus` based on episode data.
755
+
756
+ ### action_space_sample()
757
+
758
+ Returns a random valid action (for random baseline agents). Uses `self._rng` for reproducibility.
759
+
760
+ ---
761
+
762
+ ## Step 6 — env/__init__.py
763
+
764
+ ```python
765
+ from .environment import InvoiceExceptionEnv
766
+ from .models import Action, ActionType, EnvironmentState, StepResult
767
+ from .tasks import ALL_TASKS, make_task
768
+
769
+ __all__ = [
770
+ "InvoiceExceptionEnv",
771
+ "Action",
772
+ "ActionType",
773
+ "EnvironmentState",
774
+ "StepResult",
775
+ "ALL_TASKS",
776
+ "make_task",
777
+ ]
778
+ ```
779
+
780
+ ---
781
+
782
+ ## Step 7 — Smoke Test Before Continuing
783
+
784
+ Before writing openenv.yaml or inference.py, verify the environment works.
785
+
786
+ ```python
787
+ # test_smoke.py — run this, do not commit it
788
+ from env import InvoiceExceptionEnv, Action, ALL_TASKS
789
+
790
+ print("Tasks:", ALL_TASKS)
791
+
792
+ env = InvoiceExceptionEnv(seed=42)
793
+
794
+ for task_id in ALL_TASKS:
795
+ obs = env.reset(task_id)
796
+ print(f"\n--- {task_id} ---")
797
+ print("Ticket:", obs.exception_flag.flag_description[:80])
798
+
799
+ # Take a few actions
800
+ r1 = env.step(Action.run_check(obs.available_checks[0]))
801
+ print(f"Step 1 reward: {r1.reward}")
802
+
803
+ r2 = env.step(Action.make_decision("approve", "test"))
804
+ print(f"Step 2 reward: {r2.reward}")
805
+
806
+ r3 = env.step(Action.close_case("closed"))
807
+ print(f"Step 3 reward: {r3.reward}, done: {r3.done}")
808
+
809
+ scores = env.grade()
810
+ print(f"Grade: {scores['score']}")
811
+
812
+ print("\nSmoke test passed.")
813
+ ```
814
+
815
+ All three tasks must complete without errors. Scores must be in [0.0, 1.0].
816
+
817
+ ---
818
+
819
+ ## Step 8 — openenv.yaml
820
+
821
+ This file must pass `openenv validate`. Write it carefully.
822
+
823
+ ```yaml
824
+ # openenv.yaml
825
+ name: Invoice Exception Handler
826
+ version: "1.0.0"
827
+ description: |
828
+ An agent learning environment simulating accounts payable exception handling.
829
+ The agent acts as an AP analyst: investigates flagged invoices, applies business
830
+ rules, detects fraud signals, makes decisions, and closes cases with an audit trail.
831
+
832
+ authors:
833
+ - name: Your Name
834
+ email: your@email.com
835
+
836
+ license: MIT
837
+
838
+ tasks:
839
+ - id: task1_price_variance
840
+ name: Price Variance Exception
841
+ difficulty: easy
842
+ description: |
843
+ Office stationery invoice arrives 3.08% above PO. Company tolerance policy
844
+ allows ±2% auto-approval. Agent must detect the variance, verify through
845
+ the tolerance rule, confirm verbal approval with procurement, and approve
846
+ with a PO amendment request.
847
+ max_steps: 18
848
+ optimal_score: 1.0
849
+ min_passing_score: 0.60
850
+
851
+ - id: task2_duplicate_tax
852
+ name: Duplicate Invoice with Tax Error
853
+ difficulty: medium
854
+ description: |
855
+ Logistics supplier submits INV-2024-891, a duplicate of paid INV-2024-819
856
+ (digit transposition: 891 vs 819). Original invoice had wrong GST rate (15%
857
+ vs correct 18%) — company overpaid ₹3,240. New invoice has correct rate.
858
+ Agent must detect the duplicate, identify the tax error in the original,
859
+ and partially approve only the ₹3,240 tax correction.
860
+ max_steps: 20
861
+ optimal_score: 1.0
862
+ min_passing_score: 0.50
863
+
864
+ - id: task3_compound_fraud
865
+ name: Compound Fraud Signals
866
+ difficulty: hard
867
+ description: |
868
+ IT equipment supplier invoice with four simultaneous fraud signals: bank
869
+ account changed via BEC attack (lookalike email domain), GSTIN belongs to
870
+ a different entity, 2 of 15 laptops not yet received, and unit price 8.65%
871
+ above PO. Agent must find all signals, use the correct communication channel
872
+ (phone, not email — which would contact the fraudster), and escalate to legal
873
+ and security.
874
+ max_steps: 25
875
+ optimal_score: 1.0
876
+ min_passing_score: 0.40
877
+
878
+ observation_space:
879
+ type: object
880
+ description: EnvironmentState Pydantic model
881
+ fields:
882
+ task_id: {type: string}
883
+ step_number: {type: integer}
884
+ case_status: {type: string, enum: [open, in_review, decided, routed, closed]}
885
+ purchase_order: {type: object, description: "PO with line items and terms"}
886
+ invoice: {type: object, description: "Supplier invoice with line items and tax"}
887
+ grn: {type: object, description: "Goods receipt — what actually arrived"}
888
+ supplier_master: {type: object, description: "Verified supplier record"}
889
+ exception_flag: {type: object, description: "Why the system flagged this invoice"}
890
+ inspections: {type: array, description: "Fields the agent has inspected"}
891
+ checks_run: {type: array, description: "Validation checks completed"}
892
+ queries: {type: array, description: "Internal and supplier queries"}
893
+ rules_applied: {type: array, description: "Business rules applied"}
894
+ decision: {type: string, nullable: true}
895
+ routed_to: {type: array}
896
+ available_actions: {type: array}
897
+ available_checks: {type: array}
898
+ available_rules: {type: array}
899
+ knowledge_base: {type: array}
900
+ cumulative_reward: {type: number}
901
+
902
+ action_space:
903
+ type: object
904
+ description: Action with type and params
905
+ actions:
906
+ inspect_field:
907
+ params: {document: string, field: string}
908
+ cross_check:
909
+ params: {field: string, doc_a: string, doc_b: string}
910
+ run_check:
911
+ params: {check_name: string}
912
+ query_supplier:
913
+ params: {question: string, channel: string}
914
+ query_internal:
915
+ params: {department: string, question: string}
916
+ apply_rule:
917
+ params: {rule_id: string}
918
+ make_decision:
919
+ params: {decision: string, reason: string}
920
+ route_to:
921
+ params: {team: string, notes: string}
922
+ close_case:
923
+ params: {summary: string}
924
+
925
+ reward:
926
+ range: [-1.0, 1.0]
927
+ description: |
928
+ Shaped reward at every step. Relevant inspections: +0.01 to +0.14.
929
+ Diagnostics revealing issues: +0.08 to +0.18. Correct fixes: +0.08 to +0.30.
930
+ Wrong decision on fraud: -0.15 to -0.40. Repeat actions: -0.02 to -0.05.
931
+ SLA breach: -0.10.
932
+
933
+ grading:
934
+ method: task_grader
935
+ scores:
936
+ - score # 0.0–1.0 overall
937
+ - diagnosis_score
938
+ - investigation_score
939
+ - decision_score
940
+ - routing_score
941
+ - closure_score
942
+ - efficiency_score
943
+
944
+ api:
945
+ reset:
946
+ signature: "reset(task_id: str | None = None) -> EnvironmentState"
947
+ step:
948
+ signature: "step(action: Action | dict) -> StepResult"
949
+ state:
950
+ signature: "state() -> EnvironmentState"
951
+ grade:
952
+ signature: "grade() -> Dict[str, float]"
953
+
954
+ http_endpoints:
955
+ - path: /reset
956
+ method: POST
957
+ description: Reset environment, returns EnvironmentState JSON
958
+ - path: /step
959
+ method: POST
960
+ description: Execute action, returns StepResult JSON
961
+ - path: /state
962
+ method: GET
963
+ description: Current state, returns EnvironmentState JSON
964
+ - path: /grade
965
+ method: POST
966
+ description: Grade current episode
967
+ - path: /health
968
+ method: GET
969
+ description: Health check
970
+
971
+ dependencies:
972
+ python: ">=3.11"
973
+ packages:
974
+ - pydantic==2.7.1
975
+ - fastapi==0.111.0
976
+ - uvicorn==0.29.0
977
+ - gradio==4.36.1
978
+ - openai==1.35.3
979
+ - pyyaml==6.0.1
980
+
981
+ docker:
982
+ port: 7860
983
+ health_check: /health
984
+ ```
985
+
986
+ ---
987
+
988
+ ## Step 9 — inference.py
989
+
990
+ This is the most critical file for the hackathon validator. Get the format exactly right.
991
+
992
+ ### Required env vars
993
+
994
+ ```python
995
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
996
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
997
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "")
998
+ ```
999
+
1000
+ ### Required stdout format
1001
+
1002
+ Every line to stdout must be exactly:
1003
+ ```
1004
+ [START] task=<task_id> env=invoice-exception-handler model=<model_name>
1005
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
1006
+ [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
1007
+ ```
1008
+
1009
+ Rules (do not deviate):
1010
+ - One `[START]` line at episode begin
1011
+ - One `[STEP]` line per step, immediately after `env.step()` returns
1012
+ - One `[END]` line after the episode, always emitted even on exception
1013
+ - `reward` and all values in `rewards` formatted to exactly 2 decimal places
1014
+ - `score` formatted to exactly 3 decimal places
1015
+ - `done` and `success` are lowercase: `true` or `false`
1016
+ - `error` is the error message string, or exactly `null` if none
1017
+ - No newlines within a single line
1018
+ - `flush=True` on every print so the validator sees output in real time
1019
+
1020
+ ### System prompt for the LLM
1021
+
1022
+ Write a clear system prompt that tells the model:
1023
+ - It is an AP analyst handling a flagged invoice
1024
+ - It has a structured action space (list all 9 action types)
1025
+ - It must respond in JSON: `{"type": "...", "params": {...}}`
1026
+ - It should investigate before deciding
1027
+ - Never approve without checking, never contact supplier by email if fraud is suspected
1028
+ - Available documents: PO, Invoice, GRN, Supplier Master, Exception Flag
1029
+
1030
+ ### User prompt per step
1031
+
1032
+ Include in the user prompt:
1033
+ - Current step number and max steps
1034
+ - The exception flag (what was flagged and why)
1035
+ - Available checks (list them)
1036
+ - Available rules (list them)
1037
+ - Knowledge base entries (the policy list)
1038
+ - What has been done so far (checks run, queries made, inspections done)
1039
+ - Current cumulative reward
1040
+ - Ask for next action as JSON
1041
+
1042
+ ### Parsing LLM output
1043
+
1044
+ ```python
1045
+ def parse_action(raw_text: str) -> dict:
1046
+ """
1047
+ Parse the model's response into an action dict.
1048
+ Handles markdown code fences, extra whitespace, and minor formatting errors.
1049
+ Falls back to run_check(po_match) if parsing fails.
1050
+ """
1051
+ text = raw_text.strip()
1052
+ # Remove ```json or ``` fences if present
1053
+ if text.startswith("```"):
1054
+ lines = text.split("\n")
1055
+ text = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
1056
+ try:
1057
+ return json.loads(text.strip())
1058
+ except json.JSONDecodeError:
1059
+ # Try to find JSON within the text
1060
+ import re
1061
+ match = re.search(r'\{.*\}', text, re.DOTALL)
1062
+ if match:
1063
+ try:
1064
+ return json.loads(match.group())
1065
+ except json.JSONDecodeError:
1066
+ pass
1067
+ # Safe fallback
1068
+ return {"type": "run_check", "params": {"check_name": "po_match"}}
1069
+ ```
1070
+
1071
+ ### Overall structure
1072
+
1073
+ ```python
1074
+ def run_task(client, env, task_id, max_steps=20):
1075
+ """Run one task episode and return (steps_taken, score, rewards)."""
1076
+ rewards = []
1077
+
1078
+ print(f"[START] task={task_id} env=invoice-exception-handler model={MODEL_NAME}", flush=True)
1079
+
1080
+ obs = env.reset(task_id)
1081
+ history = []
1082
+
1083
+ for step in range(1, max_steps + 1):
1084
+ # Build prompt from observation
1085
+ user_prompt = build_prompt(obs, step, max_steps, history)
1086
+
1087
+ # Call LLM
1088
+ raw = call_llm(client, user_prompt)
1089
+ action_dict = parse_action(raw)
1090
+
1091
+ # Execute
1092
+ try:
1093
+ result = env.step(action_dict)
1094
+ reward = result.reward
1095
+ done = result.done
1096
+ error = None
1097
+ except Exception as e:
1098
+ reward = 0.0
1099
+ done = False
1100
+ error = str(e)
1101
+ result = None
1102
+
1103
+ rewards.append(reward)
1104
+ action_str = json.dumps(action_dict)
1105
+
1106
+ print(
1107
+ f"[STEP] step={step} action={action_str} "
1108
+ f"reward={reward:.2f} done={str(done).lower()} "
1109
+ f"error={error or 'null'}",
1110
+ flush=True
1111
+ )
1112
+
1113
+ history.append(f"Step {step}: {action_str} → reward {reward:+.2f}")
1114
+
1115
+ if result:
1116
+ obs = result.observation
1117
+
1118
+ if done:
1119
+ break
1120
+
1121
+ score = env.grade()["score"]
1122
+ success = score >= 0.5
1123
+ steps_taken = min(step, max_steps)
1124
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
1125
+
1126
+ print(
1127
+ f"[END] success={str(success).lower()} steps={steps_taken} "
1128
+ f"score={score:.3f} rewards={rewards_str}",
1129
+ flush=True
1130
+ )
1131
+
1132
+ return steps_taken, score, rewards
1133
+
1134
+
1135
+ def main():
1136
+ client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
1137
+ env = InvoiceExceptionEnv(seed=42)
1138
+
1139
+ for task_id in ALL_TASKS:
1140
+ run_task(client, env, task_id, max_steps=make_task(task_id).max_steps)
1141
+
1142
+
1143
+ if __name__ == "__main__":
1144
+ main()
1145
+ ```
1146
+
1147
+ ---
1148
+
1149
+ ## Step 10 — app.py
1150
+
1151
+ The app.py serves two purposes:
1152
+ 1. Provides the FastAPI HTTP endpoints that the validator pings (`POST /reset` must return 200)
1153
+ 2. Provides a Gradio UI for interactive exploration on HF Spaces
1154
+
1155
+ ### Architecture
1156
+
1157
+ Run both FastAPI and Gradio in the same process on port 7860.
1158
+ Use `gr.mount_gradio_app` to mount Gradio on FastAPI, or run Gradio alongside FastAPI.
1159
+
1160
+ The cleanest approach:
1161
+
1162
+ ```python
1163
+ import gradio as gr
1164
+ from fastapi import FastAPI
1165
+ from fastapi.responses import JSONResponse
1166
+ import uvicorn
1167
+
1168
+ app = FastAPI(title="Invoice Exception Handler OpenEnv")
1169
+ env = InvoiceExceptionEnv(seed=42) # shared environment instance
1170
+
1171
+ @app.post("/reset")
1172
+ async def http_reset(body: dict | None = None):
1173
+ task_id = (body or {}).get("task_id")
1174
+ obs = env.reset(task_id)
1175
+ return JSONResponse(obs.model_dump(mode="json"))
1176
+
1177
+ @app.post("/step")
1178
+ async def http_step(body: dict):
1179
+ result = env.step(body)
1180
+ return JSONResponse(result.model_dump(mode="json"))
1181
+
1182
+ @app.get("/state")
1183
+ async def http_state():
1184
+ return JSONResponse(env.state().model_dump(mode="json"))
1185
+
1186
+ @app.post("/grade")
1187
+ async def http_grade():
1188
+ return JSONResponse(env.grade())
1189
+
1190
+ @app.get("/tasks")
1191
+ async def http_tasks():
1192
+ return JSONResponse(ALL_TASKS)
1193
+
1194
+ @app.get("/health")
1195
+ async def health():
1196
+ return JSONResponse({"status": "ok", "version": "1.0.0"})
1197
+
1198
+ # Mount Gradio at the root path
1199
+ gradio_app = build_gradio_ui()
1200
+ app = gr.mount_gradio_app(app, gradio_app, path="/")
1201
+ ```
1202
+
1203
+ ### Gradio UI — what to build
1204
+
1205
+ Keep the UI simple and functional. Three tabs:
1206
+
1207
+ **Tab 1: Manual Play**
1208
+ - Dropdown to select task (labels: "Task 1 — Price Variance (Easy)", etc.)
1209
+ - Reset button
1210
+ - Shows the exception flag, the key document fields, and available actions
1211
+ - Dropdown or textbox to compose and submit an action
1212
+ - Shows reward, cumulative reward, and status after each step
1213
+ - Shows grade breakdown when episode ends
1214
+
1215
+ **Tab 2: Agent Demo**
1216
+ - Select task
1217
+ - Shows a hardcoded optimal action sequence running step by step
1218
+ - Good for demonstrating the environment to judges who won't run code
1219
+
1220
+ **Tab 3: API Reference**
1221
+ - Code examples for each action type
1222
+ - Reward table
1223
+ - Grader score breakdown explanation
1224
+
1225
+ ---
1226
+
1227
+ ## Step 11 — Dockerfile
1228
+
1229
+ ```dockerfile
1230
+ FROM python:3.11-slim
1231
+
1232
+ # Install system dependencies
1233
+ RUN apt-get update \
1234
+ && apt-get install -y --no-install-recommends curl \
1235
+ && rm -rf /var/lib/apt/lists/*
1236
+
1237
+ # Create non-root user (required by HF Spaces)
1238
+ RUN useradd -m -u 1000 appuser
1239
+
1240
+ WORKDIR /app
1241
+
1242
+ # Copy and install dependencies first (layer caching)
1243
+ COPY requirements.txt .
1244
+ RUN pip install --no-cache-dir -r requirements.txt
1245
+
1246
+ # Copy application code
1247
+ COPY --chown=appuser:appuser . .
1248
+
1249
+ USER appuser
1250
+
1251
+ EXPOSE 7860
1252
+
1253
+ # Health check — pings the /health endpoint
1254
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \
1255
+ CMD curl -f http://localhost:7860/health || exit 1
1256
+
1257
+ ENV PYTHONUNBUFFERED=1
1258
+ ENV GRADIO_SERVER_NAME=0.0.0.0
1259
+ ENV GRADIO_SERVER_PORT=7860
1260
+
1261
+ CMD ["python", "app.py"]
1262
+ ```
1263
+
1264
+ ---
1265
+
1266
+ ## Step 12 — End-to-End Test Checklist
1267
+
1268
+ Before pushing, check every item in this list.
1269
+
1270
+ ```bash
1271
+ # 1. Imports work
1272
+ python -c "from env import InvoiceExceptionEnv, Action, ALL_TASKS; print('OK')"
1273
+
1274
+ # 2. All three tasks complete without errors
1275
+ python -c "
1276
+ from env import InvoiceExceptionEnv, Action, ALL_TASKS
1277
+ env = InvoiceExceptionEnv(seed=42)
1278
+ for t in ALL_TASKS:
1279
+ obs = env.reset(t)
1280
+ result = env.step(Action.run_check(obs.available_checks[0]))
1281
+ result = env.step(Action.make_decision('reject', 'test'))
1282
+ result = env.step(Action.close_case('test'))
1283
+ score = env.grade()['score']
1284
+ assert 0.0 <= score <= 1.0, f'Score out of range: {score}'
1285
+ print(f'{t}: {score}')
1286
+ print('All tasks OK')
1287
+ "
1288
+
1289
+ # 3. Graders are deterministic
1290
+ python -c "
1291
+ from env import InvoiceExceptionEnv, Action
1292
+ env1 = InvoiceExceptionEnv(seed=42)
1293
+ env2 = InvoiceExceptionEnv(seed=42)
1294
+ obs1 = env1.reset('task1_price_variance')
1295
+ obs2 = env2.reset('task1_price_variance')
1296
+ env1.step(Action.run_check('tolerance_rule'))
1297
+ env2.step(Action.run_check('tolerance_rule'))
1298
+ env1.step(Action.make_decision('approve', 'test'))
1299
+ env2.step(Action.make_decision('approve', 'test'))
1300
+ env1.step(Action.close_case('done'))
1301
+ env2.step(Action.close_case('done'))
1302
+ s1 = env1.grade()['score']
1303
+ s2 = env2.grade()['score']
1304
+ assert s1 == s2, f'Non-deterministic: {s1} vs {s2}'
1305
+ print(f'Deterministic: {s1}')
1306
+ "
1307
+
1308
+ # 4. inference.py log format (with fake API key)
1309
+ API_BASE_URL=https://api.example.com HF_TOKEN=fake MODEL_NAME=test python -c "
1310
+ # This will fail on the API call but should print [START] before failing
1311
+ import subprocess, sys
1312
+ "
1313
+ # Manually verify the [START] line would print correctly
1314
+
1315
+ # 5. Docker builds
1316
+ docker build -t invoice-env-test .
1317
+
1318
+ # 6. Docker runs and /health returns 200
1319
+ docker run -d -p 7860:7860 --name test-env invoice-env-test
1320
+ sleep 15
1321
+ curl -f http://localhost:7860/health
1322
+ curl -s -X POST http://localhost:7860/reset -H "Content-Type: application/json" -d '{}'
1323
+ docker stop test-env && docker rm test-env
1324
+
1325
+ # 7. openenv validate (if openenv-core is installed)
1326
+ pip install openenv-core
1327
+ openenv validate
1328
+ ```
1329
+
1330
+ ---
1331
+
1332
+ ## Step 13 — documents/ Folder
1333
+
1334
+ Create these four files. Keep them updated as the project evolves.
1335
+
1336
+ ### documents/CHANGELOG.md
1337
+
1338
+ ```markdown
1339
+ # Changelog
1340
+
1341
+ All changes to the Invoice Exception Handler environment are recorded here.
1342
+ Format: Date | Version | What changed | Why
1343
+
1344
+ ---
1345
+
1346
+ ## [1.0.0] — 2025-01-20
1347
+
1348
+ ### Added
1349
+ - Initial implementation of InvoiceExceptionEnv with full OpenEnv API
1350
+ - Three tasks: task1_price_variance, task2_duplicate_tax, task3_compound_fraud
1351
+ - Pydantic v2 typed models for all environment objects
1352
+ - FastAPI HTTP endpoints for HF Spaces validation
1353
+ - Gradio UI for interactive exploration
1354
+ - inference.py using OpenAI client with [START][STEP][END] log format
1355
+ - openenv.yaml spec file
1356
+ - Dockerfile for HF Spaces deployment
1357
+
1358
+ ### Design decisions
1359
+ - Used pure Python simulation (no external databases) for portability and determinism
1360
+ - Compound fraud task has four signals to prevent simple greedy agents from scoring well
1361
+ - Channel selection in Task 3 (phone vs email) tests policy knowledge, not just anomaly detection
1362
+ - Grader uses sub-scores to allow partial credit for partial solutions
1363
+ ```
1364
+
1365
+ ### documents/ARCHITECTURE.md
1366
+
1367
+ Document the system architecture. Include:
1368
+ - A text diagram of how the components connect
1369
+ - Why FastAPI and Gradio in the same process (HF Spaces constraint)
1370
+ - Why Pydantic v2 (spec requirement, validation)
1371
+ - How EpisodeData separates mutable state from immutable document context
1372
+ - Why tasks are separate classes (easy to extend)
1373
+
1374
+ ### documents/BASELINE-SCORES.md
1375
+
1376
+ Record the reproducible baseline scores. Run them yourself and copy the output here.
1377
+
1378
+ ```markdown
1379
+ # Baseline Scores
1380
+
1381
+ Recorded on: 2025-01-20
1382
+ Seed: 42
1383
+ Machine: 2 vCPU, 8GB RAM
1384
+
1385
+ ## Random Agent (action_space_sample())
1386
+
1387
+ | Task | Score | Steps |
1388
+ |------|-------|-------|
1389
+ | task1_price_variance | ~0.18 | 18 (SLA breach) |
1390
+ | task2_duplicate_tax | ~0.12 | 20 (SLA breach) |
1391
+ | task3_compound_fraud | ~0.08 | 25 (SLA breach) |
1392
+ | **Average** | **~0.13** | |
1393
+
1394
+ ## Optimal Agent (hardcoded correct actions)
1395
+
1396
+ | Task | Score | Steps |
1397
+ |------|-------|-------|
1398
+ | task1_price_variance | ~0.98 | 9 |
1399
+ | task2_duplicate_tax | ~0.95 | 10 |
1400
+ | task3_compound_fraud | ~0.92 | 14 |
1401
+ | **Average** | **~0.95** | |
1402
+ ```
1403
+
1404
+ ---
1405
+
1406
+ ## Step 14 — Push and Verify
1407
+
1408
+ ```bash
1409
+ # Final commit
1410
+ git add .
1411
+ git commit -m "feat: complete invoice exception handler v1.0.0
1412
+
1413
+ - 3 tasks with deterministic graders (easy/medium/hard)
1414
+ - Full OpenEnv API: reset/step/state/grade
1415
+ - FastAPI HTTP endpoints for validator (/reset, /step, /state, /health)
1416
+ - Gradio UI for HF Spaces
1417
+ - inference.py with OpenAI client and [START][STEP][END] format
1418
+ - openenv.yaml spec
1419
+ - Dockerfile for HF Spaces deployment
1420
+ - documents/ folder with PRD, changelog, architecture, baseline scores"
1421
+
1422
+ git push origin main
1423
+
1424
+ # Deploy to HF Spaces (if not using git-based deployment)
1425
+ # The Dockerfile and app.py handle this automatically when pushed to HF
1426
+ ```
1427
+
1428
+ ---
1429
+
1430
+ ## Action Space Reference
1431
+
1432
+ | Action Type | Required Params | Description |
1433
+ |---|---|---|
1434
+ | `inspect_field` | `document, field` | Look at a specific field in a document |
1435
+ | `cross_check` | `field, doc_a, doc_b` | Compare a field between two documents |
1436
+ | `run_check` | `check_name` | Run a named validation check |
1437
+ | `query_supplier` | `question, channel` | Ask the supplier something (channel: phone or email) |
1438
+ | `query_internal` | `department, question` | Ask an internal team |
1439
+ | `apply_rule` | `rule_id` | Apply a business policy rule |
1440
+ | `make_decision` | `decision, reason` | approve / reject / hold / partial_approve |
1441
+ | `route_to` | `team, notes` | Escalate to a team |
1442
+ | `close_case` | `summary` | Close with an audit trail summary |
1443
+
1444
+ ---
1445
+
1446
+ ## Observation Space Reference
1447
+
1448
+ | Field | Type | Description |
1449
+ |---|---|---|
1450
+ | `task_id` | str | Which task is running |
1451
+ | `step_number` | int | Current step |
1452
+ | `case_status` | str | open / in_review / decided / routed / closed |
1453
+ | `purchase_order` | PurchaseOrder | What was agreed to be purchased |
1454
+ | `invoice` | Invoice | What the supplier is claiming |
1455
+ | `grn` | GoodsReceiptNote | What actually arrived |
1456
+ | `supplier_master` | SupplierMaster | Verified supplier record |
1457
+ | `exception_flag` | ExceptionFlag | Why this invoice was flagged |
1458
+ | `inspections` | List | Fields already inspected |
1459
+ | `checks_run` | List | Validation checks already run |
1460
+ | `queries` | List | Queries made and responses |
1461
+ | `rules_applied` | List | Business rules applied |
1462
+ | `decision` | str? | Current decision if made |
1463
+ | `routed_to` | List | Teams this case has been escalated to |
1464
+ | `available_actions` | List | All 9 action types |
1465
+ | `available_checks` | List | Check names valid for this task |
1466
+ | `available_rules` | List | Rule IDs valid for this task |
1467
+ | `knowledge_base` | List | Policy entries relevant to this task |
1468
+ | `cumulative_reward` | float | Sum of all rewards so far |
1469
+
1470
+ ---
1471
+
1472
+ ## Reward Reference
1473
+
1474
+ | Event | Reward |
1475
+ |---|---|
1476
+ | Inspecting a key field that reveals an anomaly | +0.08 to +0.14 |
1477
+ | Inspecting a routine field | +0.01 to +0.06 |
1478
+ | Cross-check that finds a mismatch | +0.12 to +0.15 |
1479
+ | Running a check that finds an issue | +0.08 to +0.18 |
1480
+ | Querying the right person | +0.04 to +0.12 |
1481
+ | Contacting supplier via wrong channel (Task 3) | −0.15 |
1482
+ | Applying the correct business rule | +0.08 to +0.12 |
1483
+ | Applying the wrong rule | −0.05 to −0.10 |
1484
+ | Correct decision (approve/reject/partial) | +0.18 to +0.28 |
1485
+ | Approving a fraudulent invoice | −0.35 to −0.40 |
1486
+ | Wrong rejection (task1) | −0.10 |
1487
+ | Routing to the right team | +0.06 to +0.14 |
1488
+ | Clean case closure | +0.06 to +0.12 |
1489
+ | Repeat action | −0.02 to −0.05 |
1490
+ | SLA breach (exceed max_steps) | −0.10 |
1491
+
1492
+ ---
1493
+
1494
+ ## Expected Baseline Scores
1495
+
1496
+ These are the scores you should see when running `inference.py` with a good LLM.
1497
+
1498
+ | Task | Difficulty | Random Agent | Rule Agent | LLM Agent (Qwen-72B) |
1499
+ |---|---|---|---|---|
1500
+ | task1_price_variance | Easy | ~0.18 | ~0.85 | ~0.80 |
1501
+ | task2_duplicate_tax | Medium | ~0.12 | ~0.72 | ~0.68 |
1502
+ | task3_compound_fraud | Hard | ~0.08 | ~0.55 | ~0.45 |
1503
+
1504
+ The hard task should be genuinely hard for LLMs — a score of 0.45 is expected, not a failure.
1505
+
1506
+ ---
1507
+
1508
+ ## Environment Variables
1509
+
1510
+ | Variable | Required | Default | Description |
1511
+ |---|---|---|---|
1512
+ | `API_BASE_URL` | Yes | `https://router.huggingface.co/v1` | LLM endpoint |
1513
+ | `MODEL_NAME` | Yes | `Qwen/Qwen2.5-72B-Instruct` | Model to use |
1514
+ | `HF_TOKEN` | Yes | — | API key for the LLM endpoint |
1515
+ | `ANTHROPIC_API_KEY` | No | — | Only if using Anthropic models directly |
1516
+
1517
+ ---
1518
+
1519
+ ## Setup Instructions
1520
+
1521
+ ### Local Development
1522
+
1523
+ ```bash
1524
+ # Clone the repo
1525
+ git clone https://github.com/YOUR_USERNAME/invoice-exception-handler.git
1526
+ cd invoice-exception-handler
1527
+
1528
+ # Create virtual environment
1529
+ python -m venv venv
1530
+ source venv/bin/activate # Windows: venv\Scripts\activate
1531
+
1532
+ # Install dependencies
1533
+ pip install -r requirements.txt
1534
+
1535
+ # Run the app locally
1536
+ python app.py
1537
+ # Visit http://localhost:7860
1538
+ ```
1539
+
1540
+ ### Run Inference
1541
+
1542
+ ```bash
1543
+ export API_BASE_URL="https://router.huggingface.co/v1"
1544
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
1545
+ export HF_TOKEN="your-token-here"
1546
+
1547
+ python inference.py
1548
+ ```
1549
+
1550
+ ### Docker
1551
+
1552
+ ```bash
1553
+ docker build -t invoice-exception-handler .
1554
+ docker run -p 7860:7860 \
1555
+ -e API_BASE_URL="https://router.huggingface.co/v1" \
1556
+ -e MODEL_NAME="Qwen/Qwen2.5-72B-Instruct" \
1557
+ -e HF_TOKEN="your-token-here" \
1558
+ invoice-exception-handler
1559
+ ```
1560
+
1561
+ ### HF Spaces Deployment
1562
+
1563
+ 1. Create a new Space with the Docker SDK (this repository deploys from its Dockerfile)
1564
+ 2. Push this repository to it
1565
+ 3. Add secrets in Space settings: `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN`
1566
+ 4. The Space will build and deploy automatically from the Dockerfile
1567
+
1568
+ ### Validate Submission
1569
+
1570
+ ```bash
1571
+ # Install validator
1572
+ pip install openenv-core
1573
+
1574
+ # Validate the spec
1575
+ openenv validate
1576
+
1577
+ # Run the full submission validator script
1578
+ chmod +x scripts/validate-submission.sh
1579
+ ./scripts/validate-submission.sh https://your-space.hf.space .
1580
+ ```
1581
+
1582
+ ---
1583
+
1584
+ ## Common Mistakes to Avoid
1585
+
1586
+ 1. **Don't use the wrong name for the inference script.** The validator looks for a file named exactly `inference.py` in the repository root.
1587
+
1588
+ 2. **Don't use the Anthropic SDK in inference.py.** The spec requires the OpenAI client. Use `from openai import OpenAI`.
1589
+
1590
+ 3. **Don't forget `flush=True` on print statements.** The validator reads stdout line by line. Without flush, logs may not appear.
1591
+
1592
+ 4. **Don't let the Gradio UI crash the FastAPI server.** If the UI has an error, it should fail gracefully, not bring down `/reset`.
1593
+
1594
+ 5. **Don't hardcode the model name.** Always read from `os.getenv("MODEL_NAME")`.
1595
+
1596
+ 6. **Don't put business logic in models.py.** That file is just data shapes.
1597
+
1598
+ 7. **Don't mutate documents during a step.** The documents (PO, Invoice, GRN) are fixed for the duration of an episode. Only EpisodeData changes.
1599
+
1600
+ 8. **Don't forget to test determinism.** Same seed + same actions must = same score. Run the determinism test.
1601
+
1602
+ 9. **Don't skip the docker build test.** The validator builds your Docker image. If it doesn't build, you're disqualified.
1603
+
1604
+ 10. **Don't forget the changelog.** Update `documents/CHANGELOG.md` before every push.
1605
+
1606
+ ---
1607
+
1608
+ ## License
1609
+
1610
+ MIT License. See LICENSE file.
env/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Invoice Exception Handler — OpenEnv environment package.
3
+
4
+ Import the main environment class and supporting types from here:
5
+ from env import InvoiceExceptionEnv, Action, ALL_TASKS
6
+ """
7
+ from .environment import InvoiceExceptionEnv
8
+ from .models import Action, ActionType, EnvironmentState, StepResult
9
+ from .tasks import ALL_TASKS, make_task
10
+
11
+ __all__ = [
12
+ "InvoiceExceptionEnv",
13
+ "Action",
14
+ "ActionType",
15
+ "EnvironmentState",
16
+ "StepResult",
17
+ "ALL_TASKS",
18
+ "make_task",
19
+ ]
env/environment.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ InvoiceExceptionEnv — the main environment class.
3
+
4
+ This is the only class external code needs to import. It wraps the task
5
+ registry, dispatches actions, manages episode state, and provides the
6
+ OpenEnv-compatible API: reset(), step(), state(), grade().
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import random
11
+ from typing import Any, Dict, List, Optional, Union
12
+
13
+ from .models import (
14
+ Action, ActionType, CaseStatus, EnvironmentState, StepResult,
15
+ )
16
+ from .tasks import ALL_TASKS, BaseTask, EpisodeData, make_task
17
+
18
+
19
+ class InvoiceExceptionEnv:
20
+ """
21
+ OpenEnv-compatible Invoice Exception Handler environment.
22
+
23
+ Usage:
24
+ env = InvoiceExceptionEnv(seed=42)
25
+ obs = env.reset("task1_price_variance")
26
+ result = env.step(Action.run_check("tolerance_rule"))
27
+ scores = env.grade()
28
+ """
29
+
30
    def __init__(self, seed: Optional[int] = None) -> None:
        """Initialise with an optional seed for reproducibility."""
        # Single RNG drives every random choice (task pick, action sampling).
        self._rng = random.Random(seed)
        # Active task definition; set by reset(), None before the first episode.
        self._task: Optional[BaseTask] = None
        # Mutable per-episode state (inspections, checks, decision, ...).
        self._ep: Optional[EpisodeData] = None
        # Last observation built by _build_state(); served by state().
        self._state_cache: Optional[EnvironmentState] = None
        # True once the episode ended (case closed or SLA breached).
        self._done: bool = False
37
+
38
+ # ------------------------------------------------------------------
39
+ # Public API
40
+ # ------------------------------------------------------------------
41
+
42
+ def reset(self, task_id: Optional[str] = None) -> EnvironmentState:
43
+ """
44
+ Start a new episode. If task_id is None, picks one at random.
45
+ Returns the initial EnvironmentState showing all documents and
46
+ available actions.
47
+ """
48
+ if task_id is None:
49
+ task_id = self._rng.choice(ALL_TASKS)
50
+
51
+ self._task = make_task(task_id)
52
+ self._ep = EpisodeData()
53
+ self._done = False
54
+ self._state_cache = self._build_state()
55
+ return self._state_cache
56
+
57
+ def step(self, action: Union[Action, Dict[str, Any]]) -> StepResult:
58
+ """
59
+ Execute one action. Returns observation, reward, done flag, and
60
+ info dict. Raises RuntimeError if called before reset() or after
61
+ the episode is done.
62
+ """
63
+ if self._task is None or self._ep is None:
64
+ raise RuntimeError("Call reset() before step().")
65
+ if self._done:
66
+ raise RuntimeError("Episode is done. Call reset() to start a new one.")
67
+
68
+ # Convert dict to Action if needed
69
+ if isinstance(action, dict):
70
+ action = Action(
71
+ type=ActionType(action.get("type", action.get("action_type", ""))),
72
+ params=action.get("params", {}),
73
+ )
74
+
75
+ # Dispatch the action
76
+ reward, info = self._dispatch(action)
77
+
78
+ # Update episode
79
+ self._ep.step_count += 1
80
+ self._ep.cumulative_reward += reward
81
+
82
+ # Check SLA breach
83
+ sla_penalty = 0.0
84
+ if self._ep.step_count >= self._task.max_steps:
85
+ sla_penalty = -0.10
86
+ self._done = True
87
+ info["sla_breach"] = True
88
+
89
+ # Check done conditions
90
+ if self._ep.closed:
91
+ self._done = True
92
+
93
+ total_reward = reward + sla_penalty
94
+ self._ep.cumulative_reward += sla_penalty # add SLA penalty separately
95
+
96
+ # Rebuild state
97
+ self._state_cache = self._build_state()
98
+
99
+ return StepResult(
100
+ observation=self._state_cache,
101
+ reward=round(total_reward, 4),
102
+ done=self._done,
103
+ info=info,
104
+ )
105
+
106
+ def state(self) -> EnvironmentState:
107
+ """Return the current state without advancing the episode."""
108
+ if self._state_cache is None:
109
+ raise RuntimeError("Call reset() before state().")
110
+ return self._state_cache
111
+
112
+ def grade(self) -> Dict[str, float]:
113
+ """Run the task grader on the current episode and return scores."""
114
+ if self._task is None or self._ep is None:
115
+ raise RuntimeError("Call reset() before grade().")
116
+ return self._task.grade(self._ep)
117
+
118
    def action_space_sample(self) -> Action:
        """Return a random valid action for baseline/testing purposes."""
        if self._task is None:
            raise RuntimeError("Call reset() before action_space_sample().")

        # NOTE: the sequence of self._rng calls below determines the random
        # baseline's behaviour — keep call order stable for reproducibility.
        action_type = self._rng.choice(list(ActionType))

        if action_type == ActionType.INSPECT_FIELD:
            doc = self._rng.choice(["invoice", "po", "grn", "supplier_master"])
            field = self._rng.choice(["line_items", "total_amount", "bank_account",
                                      "supplier_gstin", "items_received"])
            return Action.inspect_field(doc, field)

        elif action_type == ActionType.CROSS_CHECK:
            field = self._rng.choice(["unit_price", "total_amount", "bank_account",
                                      "gstin", "quantity"])
            doc_a = self._rng.choice(["invoice", "po"])
            doc_b = self._rng.choice(["po", "grn", "supplier_master"])
            return Action.cross_check(field, doc_a, doc_b)

        elif action_type == ActionType.RUN_CHECK:
            # Checks/rules are sampled from the task's own valid lists.
            check = self._rng.choice(self._task.available_checks)
            return Action.run_check(check)

        elif action_type == ActionType.QUERY_SUPPLIER:
            channel = self._rng.choice(["email", "phone"])
            return Action.query_supplier("What is the status?", channel)

        elif action_type == ActionType.QUERY_INTERNAL:
            dept = self._rng.choice(["procurement", "finance", "legal", "security"])
            return Action.query_internal(dept, "Can you provide information?")

        elif action_type == ActionType.APPLY_RULE:
            rule = self._rng.choice(self._task.available_rules)
            return Action.apply_rule(rule)

        elif action_type == ActionType.MAKE_DECISION:
            decision = self._rng.choice(["approve", "reject", "hold", "partial_approve"])
            return Action.make_decision(decision, "Random baseline decision.")

        elif action_type == ActionType.ROUTE_TO:
            team = self._rng.choice(["procurement", "finance", "legal", "security"])
            return Action.route_to(team, "Random baseline routing.")

        elif action_type == ActionType.CLOSE_CASE:
            return Action.close_case("Random baseline closure.")

        # Fallback (unreachable while the branches above cover all ActionTypes)
        return Action.run_check(self._task.available_checks[0])
167
+
168
+ # ------------------------------------------------------------------
169
+ # Internal methods
170
+ # ------------------------------------------------------------------
171
+
172
    def _dispatch(self, action: Action) -> tuple[float, Dict[str, Any]]:
        """
        Route an action to the appropriate task simulator.
        Returns (reward, info dict). Handles repeat-action penalties.
        """
        params = action.params
        info: Dict[str, Any] = {"action_type": action.type.value}

        if action.type == ActionType.INSPECT_FIELD:
            doc = params.get("document", "")
            field = params.get("field", "")

            # Repeat penalty: re-inspecting the same field earns -0.02.
            if self._ep.has_inspected(doc, field):
                info["repeat"] = True
                return -0.02, info

            result, reward = self._task.simulate_inspect(doc, field)
            self._ep.inspections.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.CROSS_CHECK:
            field = params.get("field", "")
            doc_a = params.get("doc_a", "")
            doc_b = params.get("doc_b", "")

            # Cross-checks are deduplicated by a synthetic key; repeats cost -0.03.
            check_key = f"cross_{field}_{doc_a}_{doc_b}"
            if self._ep.has_checked(check_key):
                info["repeat"] = True
                return -0.03, info

            result, reward = self._task.simulate_cross_check(field, doc_a, doc_b)
            self._ep.checks.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.RUN_CHECK:
            check_name = params.get("check_name", "")

            # Re-running a named check earns -0.03.
            if self._ep.has_checked(check_name):
                info["repeat"] = True
                return -0.03, info

            result, reward = self._task.simulate_run_check(check_name)
            self._ep.checks.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.QUERY_SUPPLIER:
            question = params.get("question", "")
            channel = params.get("channel", "email")

            # Only one supplier query per episode; repeats cost -0.05.
            if self._ep.has_queried("supplier"):
                info["repeat"] = True
                return -0.05, info

            result, reward = self._task.simulate_query_supplier(question, channel)
            self._ep.queries.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.QUERY_INTERNAL:
            department = params.get("department", "")
            question = params.get("question", "")

            # One query per internal department; repeats cost -0.03.
            if self._ep.has_queried(department.lower()):
                info["repeat"] = True
                return -0.03, info

            result, reward = self._task.simulate_query_internal(department, question)
            self._ep.queries.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.APPLY_RULE:
            rule_id = params.get("rule_id", "")

            # Each rule may be applied once; repeats cost -0.03.
            if rule_id in self._ep.rules_applied:
                info["repeat"] = True
                return -0.03, info

            detail, reward = self._task.simulate_apply_rule(rule_id)
            self._ep.rules_applied.append(rule_id)
            info["detail"] = detail
            return reward, info

        elif action.type == ActionType.MAKE_DECISION:
            decision = params.get("decision", "")
            reason = params.get("reason", "")

            # A decision is final; attempting a second one costs -0.05.
            if self._ep.decision is not None:
                info["repeat"] = True
                return -0.05, info

            reward = self._task.simulate_make_decision(decision, reason, self._ep)
            self._ep.decision = decision
            self._ep.decision_reason = reason
            info["decision"] = decision
            return reward, info

        elif action.type == ActionType.ROUTE_TO:
            team = params.get("team", "")
            notes = params.get("notes", "")

            # Routing twice to the same team (case-insensitive) costs -0.02.
            if team.lower() in self._ep.routed_to:
                info["repeat"] = True
                return -0.02, info

            reward = self._task.simulate_route_to(team, notes, self._ep)
            self._ep.routed_to.append(team.lower())
            info["routed_to"] = team
            return reward, info

        elif action.type == ActionType.CLOSE_CASE:
            summary = params.get("summary", "")

            # Closing an already-closed case costs -0.05.
            if self._ep.closed:
                info["repeat"] = True
                return -0.05, info

            reward = self._task.simulate_close(summary, self._ep)
            self._ep.closed = True
            self._ep.close_summary = summary
            info["closed"] = True
            return reward, info

        # Unknown action type
        return 0.0, {"error": f"Unknown action type: {action.type}"}
301
+
302
    def _build_state(self) -> EnvironmentState:
        """Construct an EnvironmentState from current task and episode data."""
        # Determine case status; precedence: closed > routed > decided >
        # in-review (any step taken) > open.
        if self._ep.closed:
            status = CaseStatus.CLOSED
        elif self._ep.routed_to:
            status = CaseStatus.ROUTED
        elif self._ep.decision is not None:
            status = CaseStatus.DECIDED
        elif self._ep.step_count > 0:
            status = CaseStatus.IN_REVIEW
        else:
            status = CaseStatus.OPEN

        # list(...) copies shield the episode's internal lists from callers.
        return EnvironmentState(
            task_id=self._task.task_id,
            step_number=self._ep.step_count,
            case_status=status,
            purchase_order=self._task.get_purchase_order(),
            invoice=self._task.get_invoice(),
            grn=self._task.get_grn(),
            supplier_master=self._task.get_supplier_master(),
            exception_flag=self._task.get_exception_flag(),
            inspections=list(self._ep.inspections),
            checks_run=list(self._ep.checks),
            queries=list(self._ep.queries),
            rules_applied=list(self._ep.rules_applied),
            decision=self._ep.decision,
            decision_reason=self._ep.decision_reason,
            routed_to=list(self._ep.routed_to),
            case_closed=self._ep.closed,
            close_summary=self._ep.close_summary,
            available_actions=[at.value for at in ActionType],
            available_checks=self._task.available_checks,
            available_rules=self._task.available_rules,
            knowledge_base=self._task.knowledge_base,
            cumulative_reward=round(self._ep.cumulative_reward, 4),
        )
env/models.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Typed models for the Invoice Exception Handler OpenEnv environment.
3
+
4
+ Every object the agent sees or produces is defined here as a Pydantic model.
5
+ This is the single source of truth for the data contract between the
6
+ environment simulation and the agent.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from enum import Enum
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from pydantic import BaseModel, Field
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Enumerations
19
+ # ---------------------------------------------------------------------------
20
+
21
class ActionType(str, Enum):
    """The nine action types an agent can take during an episode.

    NOTE: member order is significant — the environment samples random
    actions from list(ActionType), so reordering changes seeded runs.
    """
    INSPECT_FIELD = "inspect_field"
    CROSS_CHECK = "cross_check"
    RUN_CHECK = "run_check"
    QUERY_SUPPLIER = "query_supplier"
    QUERY_INTERNAL = "query_internal"
    APPLY_RULE = "apply_rule"
    MAKE_DECISION = "make_decision"
    ROUTE_TO = "route_to"
    CLOSE_CASE = "close_case"
+
33
+
34
class DecisionType(str, Enum):
    """Possible decisions the agent can make on a flagged invoice."""
    APPROVE = "approve"
    REJECT = "reject"
    HOLD = "hold"
    PARTIAL_APPROVE = "partial_approve"
40
+
41
+
42
class CaseStatus(str, Enum):
    """Lifecycle status of an invoice exception case.

    Progression: open -> in_review (after the first step) -> decided /
    routed -> closed.
    """
    OPEN = "open"
    IN_REVIEW = "in_review"
    DECIDED = "decided"
    ROUTED = "routed"
    CLOSED = "closed"
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Document models — read-only context given to the agent
53
+ # ---------------------------------------------------------------------------
54
+
55
class LineItem(BaseModel):
    """One line on an invoice or purchase order."""
    description: str = Field(..., description="Item description")
    quantity: int = Field(..., description="Number of units")
    unit_price: float = Field(..., description="Price per unit in INR")
    total: float = Field(..., description="Line total in INR (quantity × unit_price)")
    # Optional because PO lines may omit tax; invoice lines typically carry it.
    tax_rate: Optional[float] = Field(None, description="Tax rate as a percentage, if applicable")
62
+
63
+
64
class PurchaseOrder(BaseModel):
    """What was agreed to be purchased.

    Read-only context for the agent; immutable for the episode's duration.
    """
    po_number: str = Field(..., description="Unique PO identifier")
    vendor_name: str = Field(..., description="Supplier name on the PO")
    po_date: str = Field(..., description="Date the PO was raised (YYYY-MM-DD)")
    line_items: List[LineItem] = Field(default_factory=list, description="Items on the PO")
    total_amount: float = Field(..., description="Total PO value in INR")
    payment_terms: str = Field("Net-30", description="Payment terms")
    currency: str = Field("INR", description="Currency code")
73
+
74
+
75
class Invoice(BaseModel):
    """What the supplier is claiming — the document under exception review.

    ``po_reference`` links this invoice to a ``PurchaseOrder``; bank and
    GST fields are meant to be verified against ``SupplierMaster``.
    """
    invoice_number: str = Field(..., description="Unique invoice identifier")
    supplier_name: str = Field(..., description="Supplier name on the invoice")
    invoice_date: str = Field(..., description="Date of the invoice (YYYY-MM-DD)")
    due_date: str = Field(..., description="Payment due date (YYYY-MM-DD)")
    po_reference: str = Field(..., description="PO number referenced by this invoice")
    line_items: List[LineItem] = Field(default_factory=list, description="Items invoiced")
    # Monetary breakdown: total_amount = subtotal + tax_amount.
    subtotal: float = Field(..., description="Pre-tax total in INR")
    tax_amount: float = Field(..., description="Total tax amount in INR")
    tax_rate: float = Field(..., description="Applied tax rate as a percentage")
    total_amount: float = Field(..., description="Grand total including tax in INR")
    # Payment details as printed on the invoice (to be cross-checked).
    bank_account: str = Field(..., description="Supplier bank account on the invoice")
    bank_name: str = Field("", description="Bank name")
    ifsc_code: str = Field("", description="IFSC / routing code")
    supplier_gstin: str = Field("", description="GST Identification Number on the invoice")
    supplier_email: str = Field("", description="Email address on the invoice")
    currency: str = Field("INR", description="Currency code")
93
+
94
+
95
class GoodsReceiptNote(BaseModel):
    """What actually arrived at the warehouse (or service confirmation).

    Linked to a ``PurchaseOrder`` via ``po_reference``.
    """
    grn_number: str = Field(..., description="Unique GRN identifier")
    po_reference: str = Field(..., description="PO number this receipt is against")
    receipt_date: str = Field(..., description="Date goods/services were received (YYYY-MM-DD)")
    # Kept as loose dicts (not LineItem) because receipt rows carry extra
    # quantity_* keys that invoices/POs do not.
    items_received: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="List of received item dicts with description, quantity_received, quantity_pending, quantity_rejected"
    )
    receiving_officer: str = Field("", description="Person who signed the receipt")
    notes: str = Field("", description="Any delivery notes or discrepancies observed")
106
+
107
+
108
class SupplierMaster(BaseModel):
    """The verified, registered supplier record in the company's ERP system.

    Source of truth for bank/GST/contact data; invoice fields are
    validated against this record.
    """
    supplier_id: str = Field(..., description="Internal supplier code")
    supplier_name: str = Field(..., description="Registered legal name")
    registered_address: str = Field("", description="Registered business address")
    gstin: str = Field(..., description="Verified GST Identification Number")
    bank_account: str = Field(..., description="Verified bank account number")
    bank_name: str = Field("", description="Bank name")
    ifsc_code: str = Field("", description="Verified IFSC / routing code")
    contact_email: str = Field("", description="Registered email address")
    contact_phone: str = Field("", description="Registered phone number")
    registered_domain: str = Field("", description="Verified email domain for the supplier")
    pan_number: str = Field("", description="PAN (tax ID)")
    # Free-form string, not an Enum; expected values listed in the description.
    status: str = Field("active", description="Supplier status: active, suspended, blacklisted")
122
+
123
+
124
class ExceptionFlag(BaseModel):
    """Why the AP system flagged this invoice for manual review."""
    flag_code: str = Field(..., description="Machine-readable code, e.g. PRICE_MISMATCH")
    flag_description: str = Field(..., description="Human-readable explanation of the flag")
    auto_hold: bool = Field(False, description="Whether the system placed an automatic payment hold")
    flagged_date: str = Field("", description="Date the flag was raised (YYYY-MM-DD)")
    # Free-form string, not an Enum; expected values listed in the description.
    severity: str = Field("medium", description="low / medium / high / critical")
131
+
132
+
133
+ # ---------------------------------------------------------------------------
134
+ # Action model
135
+ # ---------------------------------------------------------------------------
136
+
137
class Action(BaseModel):
    """
    A single action the agent asks the environment to perform.

    Rather than filling ``type``/``params`` by hand, prefer the
    convenience constructors, e.g.::

        Action.run_check("tolerance_rule")
        Action.make_decision("approve", "reason here")
    """
    type: ActionType = Field(..., description="Which action type to execute")
    params: Dict[str, Any] = Field(default_factory=dict, description="Parameters for the action")

    # --- Convenience constructors, one per ActionType ---

    @classmethod
    def inspect_field(cls, document: str, field: str) -> Action:
        """Build an action that looks at one field of one document."""
        payload = {"document": document, "field": field}
        return cls(type=ActionType.INSPECT_FIELD, params=payload)

    @classmethod
    def cross_check(cls, field: str, doc_a: str, doc_b: str) -> Action:
        """Build an action that compares a field across two documents."""
        payload = {"field": field, "doc_a": doc_a, "doc_b": doc_b}
        return cls(type=ActionType.CROSS_CHECK, params=payload)

    @classmethod
    def run_check(cls, check_name: str) -> Action:
        """Build an action that runs a named validation check."""
        payload = {"check_name": check_name}
        return cls(type=ActionType.RUN_CHECK, params=payload)

    @classmethod
    def query_supplier(cls, question: str, channel: str = "email") -> Action:
        """Build an action that asks the supplier a question on a channel."""
        payload = {"question": question, "channel": channel}
        return cls(type=ActionType.QUERY_SUPPLIER, params=payload)

    @classmethod
    def query_internal(cls, department: str, question: str) -> Action:
        """Build an action that asks an internal department a question."""
        payload = {"department": department, "question": question}
        return cls(type=ActionType.QUERY_INTERNAL, params=payload)

    @classmethod
    def apply_rule(cls, rule_id: str) -> Action:
        """Build an action that applies a named business policy rule."""
        payload = {"rule_id": rule_id}
        return cls(type=ActionType.APPLY_RULE, params=payload)

    @classmethod
    def make_decision(cls, decision: str, reason: str) -> Action:
        """Build an action that records a case decision with its reason."""
        payload = {"decision": decision, "reason": reason}
        return cls(type=ActionType.MAKE_DECISION, params=payload)

    @classmethod
    def route_to(cls, team: str, notes: str = "") -> Action:
        """Build an action that escalates the case to a specific team."""
        payload = {"team": team, "notes": notes}
        return cls(type=ActionType.ROUTE_TO, params=payload)

    @classmethod
    def close_case(cls, summary: str) -> Action:
        """Build an action that closes the case with an audit summary."""
        payload = {"summary": summary}
        return cls(type=ActionType.CLOSE_CASE, params=payload)
194
+
195
+
196
+ # ---------------------------------------------------------------------------
197
+ # Result models — returned by simulators
198
+ # ---------------------------------------------------------------------------
199
+
200
class InspectionResult(BaseModel):
    """What came back from inspecting a specific field in a document.

    Produced by the ``inspect_field`` action simulators.
    """
    document: str = Field(..., description="Which document was inspected")
    field: str = Field(..., description="Which field was inspected")
    value: Any = Field(..., description="The value found in that field")
    note: str = Field("", description="Any contextual note about the value")
    # Wall-clock time of creation, defaulted at construction.
    timestamp: float = Field(default_factory=time.time, description="When the inspection happened")
207
+
208
+
209
class CheckResult(BaseModel):
    """What came back from running a validation check or cross-check."""
    check_name: str = Field(..., description="Name of the check that was run")
    passed: bool = Field(..., description="Whether the check passed (True) or failed (False)")
    detail: str = Field("", description="Human-readable detail of what was found")
    # Wall-clock time of creation, defaulted at construction.
    timestamp: float = Field(default_factory=time.time, description="When the check was run")
215
+
216
+
217
class QueryResult(BaseModel):
    """What came back from querying a supplier or internal department."""
    target: str = Field(..., description="Who was queried (supplier, procurement, finance, etc.)")
    question: str = Field("", description="The question that was asked")
    response: str = Field(..., description="The response received")
    channel: str = Field("email", description="Communication channel used (email, phone, etc.)")
    # Wall-clock time of creation, defaulted at construction.
    timestamp: float = Field(default_factory=time.time, description="When the query was made")
224
+
225
+
226
+ # ---------------------------------------------------------------------------
227
+ # State models
228
+ # ---------------------------------------------------------------------------
229
+
230
class EnvironmentState(BaseModel):
    """
    The full observable state returned by reset() and step().

    This is what the agent sees at every turn — all documents, all history,
    and all available actions/checks/rules for the current task.
    """
    task_id: str = Field(..., description="Which task is currently running")
    step_number: int = Field(0, description="Current step number in the episode")
    case_status: CaseStatus = Field(CaseStatus.OPEN, description="Current lifecycle status")

    # The five documents (read-only scenario context)
    purchase_order: PurchaseOrder = Field(..., description="The purchase order")
    invoice: Invoice = Field(..., description="The invoice under review")
    grn: GoodsReceiptNote = Field(..., description="The goods receipt note")
    supplier_master: SupplierMaster = Field(..., description="The verified supplier record")
    exception_flag: ExceptionFlag = Field(..., description="Why this invoice was flagged")

    # Agent history — what has been done so far
    inspections: List[InspectionResult] = Field(default_factory=list, description="Fields inspected")
    checks_run: List[CheckResult] = Field(default_factory=list, description="Checks completed")
    queries: List[QueryResult] = Field(default_factory=list, description="Queries made")
    rules_applied: List[str] = Field(default_factory=list, description="Rules applied")

    # Decision state
    decision: Optional[str] = Field(None, description="Current decision if one has been made")
    decision_reason: Optional[str] = Field(None, description="Reason for the decision")
    routed_to: List[str] = Field(default_factory=list, description="Teams case has been routed to")
    case_closed: bool = Field(False, description="Whether the case has been closed")
    close_summary: Optional[str] = Field(None, description="Closure summary if case is closed")

    # Action hints — what the agent can do in the current task
    available_actions: List[str] = Field(default_factory=list, description="All valid action types")
    available_checks: List[str] = Field(default_factory=list, description="Check names for this task")
    available_rules: List[str] = Field(default_factory=list, description="Rule IDs for this task")
    knowledge_base: List[str] = Field(default_factory=list, description="Policy entries for this task")

    # Running totals
    cumulative_reward: float = Field(0.0, description="Sum of all rewards received so far")
269
+
270
+
271
class StepResult(BaseModel):
    """What step() returns — the observation, reward, done flag, and info dict.

    Mirrors the classic Gym-style (obs, reward, done, info) tuple as a model.
    """
    observation: EnvironmentState = Field(..., description="Updated environment state after the action")
    reward: float = Field(..., description="Reward for this specific action")
    done: bool = Field(False, description="Whether the episode is over")
    info: Dict[str, Any] = Field(default_factory=dict, description="Extra info about the step")
env/tasks.py ADDED
@@ -0,0 +1,984 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Task definitions for the Invoice Exception Handler environment.
3
+
4
+ Each task defines a scenario with documents, simulator logic for every action
5
+ type, and a grader that produces sub-scores in [0.0, 1.0]. This is the biggest
6
+ file in the project — it contains all the business logic the environment needs.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+ from .models import (
14
+ ActionType, CheckResult, ExceptionFlag, GoodsReceiptNote,
15
+ InspectionResult, Invoice, LineItem, PurchaseOrder, QueryResult,
16
+ SupplierMaster,
17
+ )
18
+
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # EpisodeData — mutable state for one episode
22
+ # ---------------------------------------------------------------------------
23
+
24
class EpisodeData:
    """Mutable per-episode history, used for state building and grading."""

    def __init__(self) -> None:
        # Evidence gathered by the agent so far.
        self.inspections: List[InspectionResult] = []
        self.checks: List[CheckResult] = []
        self.queries: List[QueryResult] = []
        self.rules_applied: List[str] = []
        # Case outcome, filled in as the agent acts.
        self.decision: Optional[str] = None
        self.decision_reason: Optional[str] = None
        self.routed_to: List[str] = []
        self.closed: bool = False
        self.close_summary: Optional[str] = None
        # Episode bookkeeping.
        self.step_count: int = 0
        self.cumulative_reward: float = 0.0

    def has_inspected(self, doc: str, field: str) -> bool:
        """Return True if this (document, field) pair was already inspected."""
        for item in self.inspections:
            if item.document == doc and item.field == field:
                return True
        return False

    def has_checked(self, name: str) -> bool:
        """Return True if the named validation check was already run."""
        return name in {c.check_name for c in self.checks}

    def has_queried(self, target: str) -> bool:
        """Return True if this person or department was already queried."""
        return target in {q.target for q in self.queries}
51
+
52
+
53
+ # ---------------------------------------------------------------------------
54
+ # BaseTask — abstract interface
55
+ # ---------------------------------------------------------------------------
56
+
57
class BaseTask:
    """Abstract base that all task classes inherit from.

    A concrete task supplies the five scenario documents, one simulator
    per action type (each returning a ``(result, reward)`` pair, except
    the decision/route/close simulators which return just a reward), and
    a final grader producing sub-scores.
    """

    # Identification and episode limits; overridden by concrete tasks.
    task_id: str = "base"
    max_steps: int = 20
    difficulty: str = "easy"

    # --- Scenario documents (read-only context shown to the agent) ---

    def get_purchase_order(self) -> PurchaseOrder:
        raise NotImplementedError

    def get_invoice(self) -> Invoice:
        raise NotImplementedError

    def get_grn(self) -> GoodsReceiptNote:
        raise NotImplementedError

    def get_supplier_master(self) -> SupplierMaster:
        raise NotImplementedError

    def get_exception_flag(self) -> ExceptionFlag:
        raise NotImplementedError

    # --- Action simulators, one per ActionType ---

    def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]:
        raise NotImplementedError

    def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]:
        raise NotImplementedError

    def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]:
        raise NotImplementedError

    def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]:
        raise NotImplementedError

    def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]:
        raise NotImplementedError

    def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]:
        raise NotImplementedError

    def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float:
        raise NotImplementedError

    def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float:
        raise NotImplementedError

    def simulate_close(self, summary: str, ep: EpisodeData) -> float:
        raise NotImplementedError

    # --- Grading ---

    def grade(self, ep: EpisodeData) -> Dict[str, float]:
        raise NotImplementedError

    # --- Task metadata exposed to the agent; default to empty lists ---

    @property
    def available_checks(self) -> List[str]:
        return []

    @property
    def available_rules(self) -> List[str]:
        return []

    @property
    def knowledge_base(self) -> List[str]:
        return []
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Task 1 — Price Variance Exception (Easy)
124
+ # ---------------------------------------------------------------------------
125
+
126
class PriceVarianceTask(BaseTask):
    """
    Office stationery invoice arrives 3.08% above the PO.
    Company tolerance is +/-2% auto-approval. Supplier had verbal approval
    from procurement for the price increase but the PO was never updated.

    Optimal path: check tolerance -> cross-check prices -> verify GRN ->
    query supplier -> query procurement -> apply exception rule -> approve ->
    route to procurement for PO amendment -> close.
    """

    task_id = "task1_price_variance"
    max_steps = 18
    difficulty = "easy"

    # --- Scenario documents ---

    def get_purchase_order(self) -> PurchaseOrder:
        """The agreed order: three stationery lines totalling ₹50,000."""
        return PurchaseOrder(
            po_number="PO-2024-1041",
            vendor_name="OfficeNeed Supplies",
            po_date="2024-02-15",
            line_items=[
                LineItem(description="A4 Paper", quantity=100, unit_price=220.0, total=22000.0, tax_rate=18.0),
                LineItem(description="Ballpoint Pens", quantity=20, unit_price=450.0, total=9000.0, tax_rate=18.0),
                LineItem(description="Staplers", quantity=10, unit_price=1900.0, total=19000.0, tax_rate=18.0),
            ],
            total_amount=50000.0,
            payment_terms="Net-30",
        )

    def get_invoice(self) -> Invoice:
        """The flagged invoice: A4 Paper and Pens priced above PO (subtotal ₹51,540)."""
        return Invoice(
            invoice_number="INV-ON-8821",
            supplier_name="OfficeNeed Supplies",
            invoice_date="2024-03-05",
            due_date="2024-04-04",
            po_reference="PO-2024-1041",
            line_items=[
                LineItem(description="A4 Paper", quantity=100, unit_price=231.0, total=23100.0, tax_rate=18.0),
                LineItem(description="Ballpoint Pens", quantity=20, unit_price=472.0, total=9440.0, tax_rate=18.0),
                LineItem(description="Staplers", quantity=10, unit_price=1900.0, total=19000.0, tax_rate=18.0),
            ],
            subtotal=51540.0,
            tax_amount=9277.20,
            tax_rate=18.0,
            total_amount=60817.20,
            bank_account="9876543210",
            bank_name="HDFC Bank",
            ifsc_code="HDFC0001234",
            supplier_gstin="29AABCO1234F1Z5",
            supplier_email="accounts@officeneed.com",
        )

    def get_grn(self) -> GoodsReceiptNote:
        """Receipt confirming all three items arrived in full, nothing rejected."""
        return GoodsReceiptNote(
            grn_number="GRN-2024-0892",
            po_reference="PO-2024-1041",
            receipt_date="2024-03-01",
            items_received=[
                {"description": "A4 Paper", "quantity_received": 100, "quantity_pending": 0, "quantity_rejected": 0},
                {"description": "Ballpoint Pens", "quantity_received": 20, "quantity_pending": 0, "quantity_rejected": 0},
                {"description": "Staplers", "quantity_received": 10, "quantity_pending": 0, "quantity_rejected": 0},
            ],
            receiving_officer="Ramesh Kumar",
            notes="All items received in good condition.",
        )

    def get_supplier_master(self) -> SupplierMaster:
        """ERP record whose bank account and GSTIN match the invoice exactly."""
        return SupplierMaster(
            supplier_id="SUP-0441",
            supplier_name="OfficeNeed Supplies",
            registered_address="45 MG Road, Bengaluru 560001",
            gstin="29AABCO1234F1Z5",
            bank_account="9876543210",
            bank_name="HDFC Bank",
            ifsc_code="HDFC0001234",
            contact_email="sales@officeneed.com",
            contact_phone="+91-80-4567-8901",
            registered_domain="officeneed.com",
            pan_number="AABCO1234F",
            status="active",
        )

    def get_exception_flag(self) -> ExceptionFlag:
        """PRICE_MISMATCH flag with an automatic payment hold."""
        return ExceptionFlag(
            flag_code="PRICE_MISMATCH",
            flag_description=(
                "Invoice total ₹51,540 exceeds PO ₹50,000 by ₹1,540 (3.08%). "
                "Above auto-approval threshold."
            ),
            auto_hold=True,
            flagged_date="2024-03-06",
            severity="medium",
        )

    @property
    def available_checks(self) -> List[str]:
        return ["tolerance_rule", "grn_match", "duplicate_detection",
                "bank_account_verification", "gst_verification", "po_match"]

    @property
    def available_rules(self) -> List[str]:
        return ["tolerance_2pct_auto_approve", "tolerance_exception_approval",
                "rejection_with_reason", "partial_approval"]

    @property
    def knowledge_base(self) -> List[str]:
        return [
            "POL-001: Price variance ≤±2% may be auto-approved. Above 2% requires exception approval.",
            "POL-002: Exception approval requires confirmation from originating department.",
            "POL-003: Any approved invoice with a price change must be followed by a PO amendment request.",
            "POL-004: Bank account on invoice must match supplier master.",
        ]

    # --- Simulators ---

    def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]:
        """Return meaningful values for key fields, small reward for others."""
        # Maps (document, field) -> (value shown to the agent, reward).
        # Fields relevant to diagnosing the price variance pay more.
        key_fields = {
            ("invoice", "line_items"): ("A4 Paper @₹231 (+5%), Pens @₹472 (+4.9%), Staplers @₹1900 (unchanged)", 0.10),
            ("invoice", "total_amount"): ("₹51,540 (subtotal) + ₹9,277.20 (GST 18%) = ₹60,817.20", 0.08),
            ("po", "line_items"): ("A4 Paper @₹220, Pens @₹450, Staplers @₹1900. Total: ₹50,000", 0.06),
            ("grn", "items_received"): ("All 3 items fully received. No pending, no rejected.", 0.05),
            ("invoice", "bank_account"): ("9876543210 — HDFC Bank, IFSC HDFC0001234", 0.02),
            ("invoice", "supplier_gstin"): ("29AABCO1234F1Z5", 0.02),
        }
        # Lookup is case-insensitive; unknown fields get a tiny reward.
        key = (document.lower(), field.lower())
        value, reward = key_fields.get(key, (f"{document}.{field} — no anomaly detected", 0.01))
        result = InspectionResult(document=document, field=field, value=value, note="")
        return result, reward

    def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]:
        """Cross-check a field between two documents."""
        # Maps (field, doc_a, doc_b) -> (passed, detail, reward).
        # NOTE: lookup is order-sensitive in doc_a/doc_b — only the listed
        # orderings hit the detailed results; others fall through to default.
        checks = {
            ("unit_price", "invoice", "po"): (False, "MISMATCH: A4 Paper ₹231 vs ₹220 (+5.0%), Pens ₹472 vs ₹450 (+4.9%). Staplers match.", 0.12),
            ("total_amount", "invoice", "po"): (False, "Invoice subtotal ₹51,540 vs PO ₹50,000. Variance: +₹1,540 (+3.08%).", 0.10),
            ("bank_account", "invoice", "supplier_master"): (True, "Bank account 9876543210 matches supplier master.", 0.03),
            ("gstin", "invoice", "supplier_master"): (True, "GSTIN 29AABCO1234F1Z5 matches supplier master.", 0.02),
            ("quantity", "invoice", "grn"): (True, "All quantities match: 100 reams, 20 boxes, 10 units.", 0.04),
        }
        key = (field.lower(), doc_a.lower(), doc_b.lower())
        passed, detail, reward = checks.get(key, (True, f"No mismatch found for {field} between {doc_a} and {doc_b}.", 0.01))
        result = CheckResult(check_name=f"cross_{field}_{doc_a}_{doc_b}", passed=passed, detail=detail)
        return result, reward

    def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]:
        """Run a named validation check."""
        # Maps check_name -> (passed, detail, reward). tolerance_rule is the
        # pivotal failing check that unlocks the exception-approval path.
        checks = {
            "tolerance_rule": (False, "Price variance 3.08% exceeds ±2% auto-approval threshold. Manual exception approval required.", 0.14),
            "grn_match": (True, "All items fully received. GRN matches invoice quantities.", 0.06),
            "duplicate_detection": (True, "No duplicate invoice found in payment history.", 0.02),
            "bank_account_verification": (True, "Bank account matches supplier master record.", 0.02),
            "gst_verification": (True, "GSTIN matches supplier master. GST calculation correct.", 0.02),
            "po_match": (False, "PO match FAILED on unit prices: 2 of 3 line items have price variance.", 0.08),
        }
        passed, detail, reward = checks.get(check_name, (True, f"Check '{check_name}' passed — no issues found.", 0.01))
        result = CheckResult(check_name=check_name, passed=passed, detail=detail)
        return result, reward

    def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]:
        """Query the supplier — returns email explaining the price increase."""
        # Canned response regardless of the question asked.
        response = (
            "Dear Sir/Madam, due to a 12% increase in raw material costs effective January 2024, "
            "we revised prices for A4 Paper and Ballpoint Pens. This was communicated to Mr. Arjun Mehta "
            "in your Procurement team via email on Feb 20, 2024. He acknowledged and verbally approved "
            "the revised pricing. We can provide the email trail if needed. — OfficeNeed Supplies"
        )
        result = QueryResult(target="supplier", question=question, response=response, channel=channel)
        return result, 0.10

    def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]:
        """Query an internal department."""
        # Only procurement has relevant information (and the higher reward).
        if department.lower() == "procurement":
            response = (
                "Hi, this is Arjun Mehta from Procurement. Yes, I received the price revision email "
                "from OfficeNeed on Feb 20. I verbally approved it as the increase was reasonable "
                "(raw material cost pass-through). I should have raised a PO amendment but it slipped. "
                "I'll raise the amendment today. Please go ahead and approve the invoice."
            )
            return QueryResult(target="procurement", question=question, response=response, channel="internal"), 0.12
        response = f"{department.title()} department: We don't have specific information about this invoice exception."
        return QueryResult(target=department.lower(), question=question, response=response, channel="internal"), 0.03

    def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]:
        """Apply a business rule."""
        # Maps rule_id -> (outcome text, reward). Only the exception-approval
        # rule is rewarded; the rest are blocked/inapplicable and penalized.
        rules = {
            "tolerance_2pct_auto_approve": ("BLOCKED: Cannot auto-approve. Price variance 3.08% exceeds ±2% threshold.", -0.05),
            "tolerance_exception_approval": ("APPLIED: Exception approval pathway activated. Requires department confirmation (obtained from procurement).", 0.10),
            "rejection_with_reason": ("APPLIED: Rejection rule activated. Invoice will be returned to supplier.", -0.08),
            "partial_approval": ("NOT APPLICABLE: All items received in full. Partial approval not warranted.", -0.05),
        }
        detail, reward = rules.get(rule_id, (f"Rule '{rule_id}' not found in policy database.", -0.03))
        return detail, reward

    def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float:
        """Score the agent's decision based on evidence gathered."""
        checks_run = {c.check_name for c in ep.checks}
        queries_to = {q.target for q in ep.queries}

        # Approving is correct here; the reward scales with how much
        # supporting evidence was gathered first.
        if decision == "approve":
            if "tolerance_rule" in checks_run and "procurement" in queries_to:
                return 0.25
            elif "tolerance_rule" in checks_run:
                return 0.18
            else:
                return 0.05
        elif decision == "reject":
            return -0.10
        elif decision == "hold":
            return 0.08
        # Any other decision string (e.g. partial_approve) is neutral.
        return 0.0

    def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float:
        """Score routing decisions."""
        # Procurement must amend the PO (POL-003), so it is the right target.
        routes = {"procurement": 0.12, "finance": 0.03, "legal": -0.05}
        return routes.get(team.lower(), 0.0)

    def simulate_close(self, summary: str, ep: EpisodeData) -> float:
        """Score case closure."""
        checks_run = {c.check_name for c in ep.checks}
        # Full credit only for the complete path: approve + tolerance check + routed to procurement.
        if ep.decision == "approve" and "tolerance_rule" in checks_run and "procurement" in set(ep.routed_to):
            return 0.12
        elif ep.decision is not None:
            return 0.06
        return 0.0

    def grade(self, ep: EpisodeData) -> Dict[str, float]:
        """Final grader producing sub-scores; total is clipped to [0.0, 1.0]."""
        checks_run = {c.check_name for c in ep.checks}
        queries_to = {q.target for q in ep.queries}

        # Diagnosis
        d = 0.0
        # Matches cross-check names like "cross_unit_price_invoice_po" as well
        # as "total_amount" cross-checks.
        if any("unit_price" in c.check_name or "total" in c.check_name for c in ep.checks):
            d += 0.12
        if "tolerance_rule" in checks_run:
            d += 0.14
        if "grn_match" in checks_run:
            d += 0.06

        # Investigation
        i = 0.0
        if "supplier" in queries_to:
            i += 0.10
        if "procurement" in queries_to:
            i += 0.12
        if "tolerance_exception_approval" in ep.rules_applied:
            i += 0.08

        # Decision
        dec = 0.0
        if ep.decision == "approve":
            dec += 0.18
        elif ep.decision == "hold":
            dec += 0.06
        elif ep.decision == "reject":
            dec -= 0.10

        # Routing
        route = 0.12 if "procurement" in ep.routed_to else 0.0

        # Closure
        closure = 0.08 if ep.closed else 0.0

        # Efficiency: full 0.06 up to 9 steps, then linear decay to 0.
        eff = max(0.0, 0.06 - 0.004 * max(0, ep.step_count - 9))

        total = d + i + dec + route + closure + eff
        return {
            "score": round(max(0.0, min(1.0, total)), 4),
            "diagnosis_score": round(d, 4),
            "investigation_score": round(i, 4),
            "decision_score": round(dec, 4),
            "routing_score": round(route, 4),
            "closure_score": round(closure, 4),
            "efficiency_score": round(eff, 4),
        }
402
+
403
+
404
+ # ---------------------------------------------------------------------------
405
+ # Task 2 — Duplicate Invoice with Hidden Tax Error (Medium)
406
+ # ---------------------------------------------------------------------------
407
+
408
class DuplicateTaxErrorTask(BaseTask):
    """
    Logistics supplier submits INV-2024-891 which is a duplicate of already-paid
    INV-2024-819 (digit transposition). The original invoice applied 15% GST
    (wrong), correct rate is 18%. Company overpaid ₹3,240. The new invoice has
    the correct rate. It's both a duplicate AND a legitimate correction.

    Per grade(), the highest-scoring resolution is: run duplicate_detection and
    tax_calculation_verify, consult finance and the supplier, apply
    partial_approval / credit_note_request, decide "partial_approve", route to
    finance, and close the case.
    """

    task_id = "task2_duplicate_tax"
    max_steps = 20
    difficulty = "medium"

    def get_purchase_order(self) -> PurchaseOrder:
        """Return the static PO fixture (₹1,08,000 logistics order)."""
        return PurchaseOrder(
            po_number="PO-2024-0778",
            vendor_name="FastMove Logistics",
            po_date="2024-01-25",
            line_items=[
                LineItem(description="Mumbai-Pune Transport", quantity=20, unit_price=4500.0, total=90000.0, tax_rate=18.0),
                LineItem(description="Warehousing charges Feb 2024", quantity=1, unit_price=18000.0, total=18000.0, tax_rate=18.0),
            ],
            total_amount=108000.0,
            payment_terms="Net-15",
        )

    def get_invoice(self) -> Invoice:
        """Return the flagged invoice fixture — correct 18% GST this time."""
        return Invoice(
            invoice_number="INV-2024-891",
            supplier_name="FastMove Logistics",
            invoice_date="2024-03-12",
            due_date="2024-03-27",
            po_reference="PO-2024-0778",
            line_items=[
                LineItem(description="Mumbai-Pune Transport", quantity=20, unit_price=4500.0, total=90000.0, tax_rate=18.0),
                LineItem(description="Warehousing charges Feb 2024", quantity=1, unit_price=18000.0, total=18000.0, tax_rate=18.0),
            ],
            subtotal=108000.0,
            tax_amount=19440.0,  # 18% of 1,08,000 — the CORRECT rate
            tax_rate=18.0,
            total_amount=127440.0,
            bank_account="1122334455",
            bank_name="ICICI Bank",
            ifsc_code="ICIC0005678",
            supplier_gstin="27AABCF5678G1Z3",
            supplier_email="billing@fastmove.in",
        )

    def get_grn(self) -> GoodsReceiptNote:
        """Return the GRN fixture — all services confirmed, so delivery is not the issue."""
        return GoodsReceiptNote(
            grn_number="GRN-2024-0740",
            po_reference="PO-2024-0778",
            receipt_date="2024-02-28",
            items_received=[
                {"description": "Mumbai-Pune Transport", "quantity_received": 20, "quantity_pending": 0, "quantity_rejected": 0, "service_confirmed": True},
                {"description": "Warehousing charges Feb 2024", "quantity_received": 1, "quantity_pending": 0, "quantity_rejected": 0, "service_confirmed": True},
            ],
            receiving_officer="Priya Sharma",
            notes="All transport trips completed. Warehousing service confirmed for February.",
        )

    def get_supplier_master(self) -> SupplierMaster:
        """Return the supplier-master fixture — bank/GSTIN match the invoice (no fraud angle here)."""
        return SupplierMaster(
            supplier_id="SUP-0229",
            supplier_name="FastMove Logistics",
            registered_address="12 Logistics Park, Navi Mumbai 400710",
            gstin="27AABCF5678G1Z3",
            bank_account="1122334455",
            bank_name="ICICI Bank",
            ifsc_code="ICIC0005678",
            contact_email="accounts@fastmove.in",
            contact_phone="+91-22-3456-7890",
            registered_domain="fastmove.in",
            pan_number="AABCF5678G",
            status="active",
        )

    def get_exception_flag(self) -> ExceptionFlag:
        """Return the system flag that opens the case (possible duplicate, auto-held)."""
        return ExceptionFlag(
            flag_code="POSSIBLE_DUPLICATE",
            flag_description="Invoice INV-2024-891 closely matches previously processed invoice INV-2024-819. Possible duplicate submission.",
            auto_hold=True,
            flagged_date="2024-03-13",
            severity="high",
        )

    @property
    def available_checks(self) -> List[str]:
        # Checks the agent may invoke via run_check for this task.
        return ["duplicate_detection", "tax_calculation_verify", "grn_match",
                "bank_account_verification", "gst_verification", "po_match"]

    @property
    def available_rules(self) -> List[str]:
        # Business rules the agent may invoke via apply_rule for this task.
        return ["partial_approval", "credit_note_request", "full_rejection",
                "duplicate_block", "tax_correction"]

    @property
    def knowledge_base(self) -> List[str]:
        # Policy snippets shown to the agent; POL-005..008 frame the correct resolution.
        return [
            "POL-005: Duplicate invoices must be rejected unless they represent a legitimate correction.",
            "POL-006: Tax calculation errors on paid invoices require a credit note and correction entry.",
            "POL-007: Partial approval may be used when only a portion of the invoice amount is valid.",
            "POL-008: Any tax correction must be documented with the original invoice reference.",
        ]

    def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]:
        """Return (inspection result, reward) for a field lookup; key fields pay more."""
        key_fields = {
            ("invoice", "invoice_number"): ("INV-2024-891 — note digit transposition vs INV-2024-819 (891 vs 819)", 0.10),
            ("invoice", "tax_amount"): ("₹19,440 (18% GST on ₹1,08,000) — this is the CORRECT rate", 0.08),
            ("invoice", "total_amount"): ("₹1,27,440 (subtotal ₹1,08,000 + 18% GST ₹19,440)", 0.05),
            ("invoice", "line_items"): ("Transport 20×₹4,500 = ₹90,000 + Warehousing ₹18,000 = ₹1,08,000", 0.04),
        }
        key = (document.lower(), field.lower())
        # Unknown (document, field) pairs fall through to a small flat reward.
        value, reward = key_fields.get(key, (f"{document}.{field} — no anomaly detected", 0.01))
        return InspectionResult(document=document, field=field, value=value, note=""), reward

    def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]:
        """Return (check result, reward) for a pairwise document comparison."""
        # passed=False marks a discovered discrepancy; those pay the most.
        checks = {
            ("invoice_number", "invoice", "payment_history"): (False, "MATCH FOUND: INV-2024-819 paid 12 days ago for ₹1,24,200. Digit transposition: 891 vs 819.", 0.15),
            ("tax_amount", "invoice", "payment_history"): (False, "TAX DISCREPANCY: Original INV-2024-819 had 15% GST (₹16,200). Current INV-2024-891 has 18% GST (₹19,440). Delta: ₹3,240.", 0.14),
            ("total_amount", "invoice", "po"): (True, "Invoice subtotal ₹1,08,000 matches PO total ₹1,08,000.", 0.03),
            ("bank_account", "invoice", "supplier_master"): (True, "Bank account matches supplier master.", 0.02),
        }
        key = (field.lower(), doc_a.lower(), doc_b.lower())
        passed, detail, reward = checks.get(key, (True, f"No mismatch for {field}.", 0.01))
        return CheckResult(check_name=f"cross_{field}_{doc_a}_{doc_b}", passed=passed, detail=detail), reward

    def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]:
        """Return (check result, reward) for a named automated check."""
        # duplicate_detection and tax_calculation_verify are the two key findings.
        checks = {
            "duplicate_detection": (False, "DUPLICATE FOUND: INV-2024-891 matches INV-2024-819 (paid 12 days ago, ₹1,24,200). Invoice numbers differ by digit transposition (891 vs 819).", 0.18),
            "tax_calculation_verify": (False, "TAX ERROR on ORIGINAL: INV-2024-819 applied 15% GST (₹16,200) instead of correct 18% (₹19,440). Company overpaid ₹3,240 in tax on already-paid invoice.", 0.16),
            "grn_match": (True, "Services fully confirmed. GRN matches invoice.", 0.04),
            "bank_account_verification": (True, "Bank account matches supplier master.", 0.02),
            "gst_verification": (True, "GSTIN matches supplier master.", 0.02),
            "po_match": (True, "PO amounts and line items match current invoice.", 0.03),
        }
        passed, detail, reward = checks.get(check_name, (True, f"Check '{check_name}' passed.", 0.01))
        return CheckResult(check_name=check_name, passed=passed, detail=detail), reward

    def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]:
        """Supplier confirms the resubmission story regardless of question/channel."""
        response = (
            "We are aware that INV-2024-819 was submitted with incorrect 15% GST. The correct rate "
            "is 18%. INV-2024-891 is a corrected resubmission. We request partial approval for the "
            "₹3,240 tax differential only, not the full invoice amount. We will issue a credit note "
            "for the remaining amount."
        )
        return QueryResult(target="supplier", question=question, response=response, channel=channel), 0.10

    def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]:
        """Finance confirms the overpayment; other departments return a stub answer."""
        if department.lower() == "finance":
            response = (
                "Confirmed: INV-2024-819 was paid on March 1 for ₹1,24,200 (₹1,08,000 + 15% GST of "
                "₹16,200). The correct GST rate for logistics services is 18%. We overpaid — the "
                "correct total should have been ₹1,27,440. The tax differential is ₹3,240. This "
                "can be corrected via partial approval of the new invoice for ₹3,240 only."
            )
            return QueryResult(target="finance", question=question, response=response, channel="internal"), 0.12
        response = f"{department.title()}: No specific information available."
        return QueryResult(target=department.lower(), question=question, response=response, channel="internal"), 0.03

    def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]:
        """Return (result text, reward) for a business rule; full_rejection is penalised."""
        rules = {
            "partial_approval": ("APPLIED: Partial approval for ₹3,240 (tax correction delta). Main invoice amount blocked as duplicate.", 0.12),
            "credit_note_request": ("APPLIED: Credit note requested from supplier for balance amount. Reference: INV-2024-819 tax correction.", 0.10),
            "full_rejection": ("APPLIED: Full rejection. Invoice returned to supplier.", -0.05),
            "duplicate_block": ("APPLIED: Duplicate block activated. Full payment prevented.", 0.04),
            "tax_correction": ("APPLIED: Tax correction entry created referencing original INV-2024-819.", 0.08),
        }
        detail, reward = rules.get(rule_id, (f"Rule '{rule_id}' not found.", -0.03))
        return detail, reward

    def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float:
        """Reward the decision, scaled by whether the key checks were actually run."""
        checks_run = {c.check_name for c in ep.checks}
        dup_found = "duplicate_detection" in checks_run
        tax_found = "tax_calculation_verify" in checks_run

        if decision == "partial_approve":
            # Best outcome — full reward only with both findings in hand.
            if dup_found and tax_found:
                return 0.28
            elif dup_found:
                return 0.14
            return 0.06
        elif decision == "reject":
            if dup_found:
                return 0.08
            return 0.02
        elif decision == "approve":
            # Approving pays the duplicate in full — penalised.
            return -0.15
        elif decision == "hold":
            return 0.06
        return 0.0

    def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float:
        """Reward routing; finance is the most relevant team for a tax correction."""
        routes = {"finance": 0.08, "procurement": 0.03, "legal": 0.02}
        return routes.get(team.lower(), 0.0)

    def simulate_close(self, summary: str, ep: EpisodeData) -> float:
        """Reward closing after a decision; best after partial_approve."""
        # NOTE(review): ep.closed is False here — presumably this is evaluated
        # before the env sets the closed flag for this very action; confirm in env.
        if ep.decision == "partial_approve" and ep.closed is False:
            return 0.06
        elif ep.decision is not None:
            return 0.03
        return 0.0

    def grade(self, ep: EpisodeData) -> Dict[str, float]:
        """Score the finished episode; component maxima sum to 1.0 before clamping."""
        checks_run = {c.check_name for c in ep.checks}
        queries_to = {q.target for q in ep.queries}

        # Diagnosis (max 0.30)
        d = 0.0
        if "duplicate_detection" in checks_run:
            d += 0.16
        if "tax_calculation_verify" in checks_run:
            d += 0.14

        # Investigation (max 0.32)
        i = 0.0
        if "finance" in queries_to:
            i += 0.12
        if "supplier" in queries_to:
            i += 0.10
        if "partial_approval" in ep.rules_applied:
            i += 0.06
        if "credit_note_request" in ep.rules_applied:
            i += 0.04

        # Decision (max 0.20)
        dec = 0.0
        if ep.decision == "partial_approve":
            dec = 0.20
        elif ep.decision == "reject":
            dec = 0.05
        elif ep.decision == "approve":
            dec = -0.15
        elif ep.decision == "hold":
            dec = 0.04

        # Routing (max 0.08)
        route = 0.08 if "finance" in ep.routed_to else 0.0

        # Closure (max 0.06)
        closure = 0.06 if ep.closed else 0.0

        # Efficiency — small bonus that decays after 10 steps.
        eff = max(0.0, 0.04 - 0.003 * max(0, ep.step_count - 10))

        total = d + i + dec + route + closure + eff
        return {
            "score": round(max(0.0, min(1.0, total)), 4),
            "diagnosis_score": round(d, 4),
            "investigation_score": round(i, 4),
            "decision_score": round(dec, 4),
            "routing_score": round(route, 4),
            "closure_score": round(closure, 4),
            "efficiency_score": round(eff, 4),
        }
662
+
663
+
664
+ # ---------------------------------------------------------------------------
665
+ # Task 3 — Compound Fraud Signals (Hard)
666
+ # ---------------------------------------------------------------------------
667
+
668
class CompoundFraudTask(BaseTask):
    """
    IT supplier submits ₹8,47,500 invoice for 15 laptops. System flags a bank
    account change. But there are FOUR simultaneous fraud signals: bank BEC,
    GSTIN mismatch, quantity mismatch (13 vs 15), and price inflation (8.65%).

    Critical trap: querying supplier via email contacts the fraudster (-0.15).
    Must use phone to reach real supplier (+0.15).

    Per grade(), the highest-scoring resolution is: surface all fraud signals,
    phone the supplier, involve security/legal, reject, route to legal, close.
    """

    task_id = "task3_compound_fraud"
    max_steps = 25
    difficulty = "hard"

    def get_purchase_order(self) -> PurchaseOrder:
        """Return the static PO fixture — 15 laptops @ ₹52,000."""
        return PurchaseOrder(
            po_number="PO-2024-0955",
            vendor_name="TechCore Solutions",
            po_date="2024-03-08",
            line_items=[
                LineItem(description="Business Laptop (14-inch, i7, 16GB)", quantity=15, unit_price=52000.0, total=780000.0, tax_rate=18.0),
            ],
            total_amount=780000.0,
            payment_terms="Net-30",
        )

    def get_invoice(self) -> Invoice:
        """Return the fraudulent invoice — inflated price, wrong bank, wrong GSTIN, lookalike email domain."""
        return Invoice(
            invoice_number="INV-TC-2024-0312",
            supplier_name="TechCore Solutions",
            invoice_date="2024-03-10",
            due_date="2024-04-09",
            po_reference="PO-2024-0955",
            line_items=[
                LineItem(description="Business Laptop (14-inch, i7, 16GB)", quantity=15, unit_price=56500.0, total=847500.0, tax_rate=18.0),
            ],
            subtotal=847500.0,
            tax_amount=152550.0,
            tax_rate=18.0,
            total_amount=1000050.0,
            bank_account="5566778899",  # does not match supplier master
            bank_name="Yes Bank",
            ifsc_code="YESB0000999",
            supplier_gstin="07AABCT9999X1Z8",  # does not match supplier master
            supplier_email="accounts@techcore-solutions.com",  # lookalike of registered .in domain
        )

    def get_grn(self) -> GoodsReceiptNote:
        """Return the GRN fixture — only 13 of the invoiced 15 units received."""
        return GoodsReceiptNote(
            grn_number="GRN-2024-0901",
            po_reference="PO-2024-0955",
            receipt_date="2024-03-15",
            items_received=[
                {"description": "Business Laptop (14-inch, i7, 16GB)", "quantity_received": 13, "quantity_pending": 2, "quantity_rejected": 0},
            ],
            receiving_officer="Vikram Singh",
            notes="13 of 15 laptops received. 2 units still in transit.",
        )

    def get_supplier_master(self) -> SupplierMaster:
        """Return the supplier-master fixture — the legitimate on-record details."""
        return SupplierMaster(
            supplier_id="SUP-0187",
            supplier_name="TechCore Solutions",
            registered_address="88 Tech Park, Sector 62, Noida 201301",
            gstin="07AABCT1234Y1Z5",
            bank_account="1234567890",
            bank_name="State Bank of India",
            ifsc_code="SBIN0001234",
            contact_email="sales@techcore-solutions.in",
            contact_phone="+91-120-456-7890",
            registered_domain="techcore-solutions.in",
            pan_number="AABCT1234Y",
            status="active",
        )

    def get_exception_flag(self) -> ExceptionFlag:
        """Return the system flag that opens the case (bank change, critical, auto-held)."""
        return ExceptionFlag(
            flag_code="BANK_ACCOUNT_CHANGE",
            flag_description=(
                "Invoice bank account (5566778899, Yes Bank) does not match supplier master "
                "(1234567890, SBI). Bank account change request received from "
                "accounts@techcore-solutions.com."
            ),
            auto_hold=True,
            flagged_date="2024-03-16",
            severity="critical",
        )

    @property
    def available_checks(self) -> List[str]:
        # Checks the agent may invoke via run_check for this task.
        return ["bank_account_verification", "gst_verification", "grn_match",
                "email_domain_verification", "invoice_date_validation",
                "quantity_check", "price_check", "duplicate_detection", "po_match"]

    @property
    def available_rules(self) -> List[str]:
        # Business rules the agent may invoke via apply_rule for this task.
        return ["fraud_hold", "rejection_with_reason", "bank_change_verification",
                "escalate_to_security"]

    @property
    def knowledge_base(self) -> List[str]:
        # Policy snippets; POL-009 spells out the phone-only verification trap.
        return [
            "POL-004: Bank account on invoice must match supplier master.",
            "POL-009: Bank account change must be verified via registered phone number — NEVER via email.",
            "POL-010: GSTIN on invoice must match supplier master. Mismatch is a fraud indicator.",
            "POL-011: Invoice quantities must not exceed GRN quantities.",
            "POL-012: Any suspected fraud must be escalated to Legal and Security teams.",
            "POL-013: Do not process payment while fraud investigation is pending.",
        ]

    def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]:
        """Return (inspection result, reward); fraud-signal fields pay more."""
        key_fields = {
            ("invoice", "bank_account"): ("5566778899 (Yes Bank) — DOES NOT MATCH supplier master (1234567890, SBI)", 0.12),
            ("invoice", "supplier_gstin"): ("07AABCT9999X1Z8 — DOES NOT MATCH supplier master (07AABCT1234Y1Z5)", 0.10),
            ("invoice", "supplier_email"): ("accounts@techcore-solutions.com — domain is .com, registered domain is .in", 0.08),
            ("grn", "items_received"): ("13 of 15 laptops received. 2 pending delivery.", 0.08),
            ("invoice", "line_items"): ("15 laptops @ ₹56,500 = ₹8,47,500. PO price was ₹52,000/unit.", 0.06),
            ("invoice", "invoice_date"): ("2024-03-10 (Sunday) — unusual for B2B invoicing", 0.04),
            ("invoice", "total_amount"): ("₹10,00,050 (₹8,47,500 + 18% GST ₹1,52,550)", 0.03),
        }
        key = (document.lower(), field.lower())
        value, reward = key_fields.get(key, (f"{document}.{field} — value noted", 0.01))
        return InspectionResult(document=document, field=field, value=value, note=""), reward

    def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]:
        """Return (check result, reward); all four key comparisons fail here."""
        checks = {
            ("bank_account", "invoice", "supplier_master"): (False, "MISMATCH: Invoice has 5566778899 (Yes Bank). Supplier master has 1234567890 (SBI). Change request from lookalike domain.", 0.14),
            ("gstin", "invoice", "supplier_master"): (False, "MISMATCH: Invoice GSTIN 07AABCT9999X1Z8 belongs to 'TechCore Trading Pvt Ltd' (different entity). Supplier master: 07AABCT1234Y1Z5.", 0.14),
            ("quantity", "invoice", "grn"): (False, "MISMATCH: Invoice claims 15 units. GRN shows only 13 received, 2 pending.", 0.10),
            ("unit_price", "invoice", "po"): (False, "MISMATCH: Invoice ₹56,500/unit vs PO ₹52,000/unit. Variance: +8.65%. No approved revision.", 0.08),
        }
        key = (field.lower(), doc_a.lower(), doc_b.lower())
        passed, detail, reward = checks.get(key, (True, f"No mismatch for {field}.", 0.01))
        return CheckResult(check_name=f"cross_{field}_{doc_a}_{doc_b}", passed=passed, detail=detail), reward

    def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]:
        """Return (check result, reward); every check except duplicate_detection fails."""
        checks = {
            "bank_account_verification": (False, "FAILED: Bank account mismatch. Change request from techcore-solutions.com (lookalike of registered .in domain). Suspected BEC attack.", 0.18),
            "gst_verification": (False, "FAILED: GSTIN 07AABCT9999X1Z8 registered to 'TechCore Trading Pvt Ltd' in Delhi — a DIFFERENT entity. Supplier master shows 07AABCT1234Y1Z5 for 'TechCore Solutions'.", 0.18),
            "grn_match": (False, "FAILED: Invoice claims 15 laptops. GRN confirms only 13 received. 2 units still in transit.", 0.14),
            "email_domain_verification": (False, "FAILED: Invoice email domain techcore-solutions.com does not match registered domain techcore-solutions.in. Lookalike domain — possible BEC.", 0.16),
            "invoice_date_validation": (False, "WARNING: Invoice dated 2024-03-10 is a Sunday. Unusual for B2B invoicing.", 0.08),
            "quantity_check": (False, "FAILED: Invoiced quantity (15) exceeds received quantity (13). 2 units undelivered.", 0.12),
            "price_check": (False, "FAILED: Unit price ₹56,500 is 8.65% above PO price ₹52,000. No price revision approved.", 0.10),
            "duplicate_detection": (True, "No duplicate invoice found.", 0.02),
            "po_match": (False, "FAILED: Multiple mismatches — GSTIN, quantity, and unit price all differ from PO/supplier master.", 0.08),
        }
        passed, detail, reward = checks.get(check_name, (True, f"Check '{check_name}' passed.", 0.01))
        return CheckResult(check_name=check_name, passed=passed, detail=detail), reward

    def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]:
        """Critical trap: email contacts the fraudster, phone reaches real supplier."""
        if channel.lower() in ("email", "mail"):
            # Fraudster's reply — confirms the fake account; penalised (-0.15).
            response = (
                "Dear Customer, thank you for reaching out. We recently changed our banking details "
                "due to a corporate restructuring. The new account (5566778899, Yes Bank) is correct. "
                "Please process the payment at your earliest convenience. We can provide updated "
                "KYC documents if needed. — TechCore Solutions Finance Team"
            )
            return QueryResult(target="supplier", question=question, response=response, channel=channel), -0.15
        else:
            # Any non-email channel reaches the genuine supplier; rewarded (+0.15).
            response = (
                "This is Rajesh from TechCore Solutions. We have NOT sent any bank account change "
                "request. Our bank account is still 1234567890 with SBI. We also have not sent any "
                "invoice from techcore-solutions.com — our domain is techcore-solutions.in. This "
                "looks like a fraud attempt. Please do NOT process payment to the new account. "
                "We will file a complaint with our legal team."
            )
            return QueryResult(target="supplier", question=question, response=response, channel=channel), 0.15

    def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]:
        """Security and legal give substantive answers; other departments return stubs."""
        dept = department.lower()
        if dept == "security":
            response = (
                "Security Team: We will initiate a BEC investigation. The lookalike domain "
                "techcore-solutions.com was registered 3 days before the invoice date. This is "
                "a classic Business Email Compromise pattern. Do NOT process any payment. "
                "We are preserving email headers for forensic analysis."
            )
            return QueryResult(target="security", question=question, response=response, channel="internal"), 0.10
        elif dept == "legal":
            response = (
                "Legal Team: Based on the fraud indicators you've documented, we recommend: "
                "1) Immediate payment block, 2) Formal complaint to cybercrime authorities, "
                "3) Supplier audit of TechCore Solutions, 4) Review of all recent invoices "
                "from this supplier."
            )
            return QueryResult(target="legal", question=question, response=response, channel="internal"), 0.08
        elif dept == "finance":
            response = "Finance: Payment has been blocked pending investigation. No funds released."
            return QueryResult(target="finance", question=question, response=response, channel="internal"), 0.04
        elif dept == "procurement":
            response = "Procurement: PO-2024-0955 was raised on March 8. Standard 2-day processing for IT equipment."
            return QueryResult(target="procurement", question=question, response=response, channel="internal"), 0.03
        response = f"{department.title()}: No specific information available."
        return QueryResult(target=dept, question=question, response=response, channel="internal"), 0.02

    def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]:
        """Return (result text, reward) for a business rule; unknown rules are penalised."""
        rules = {
            "fraud_hold": ("APPLIED: Fraud hold activated. All payments to this supplier frozen pending investigation.", 0.10),
            "rejection_with_reason": ("APPLIED: Invoice rejected with documented fraud signals.", 0.06),
            "bank_change_verification": ("APPLIED: Bank change verification protocol activated. Phone verification required per POL-009.", 0.08),
            "escalate_to_security": ("APPLIED: Case escalated to Information Security team for BEC investigation.", 0.08),
        }
        detail, reward = rules.get(rule_id, (f"Rule '{rule_id}' not applicable.", -0.03))
        return detail, reward

    def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float:
        """Reward the decision, scaled by how many of the 4 core fraud checks were run."""
        checks_run = {c.check_name for c in ep.checks}
        signals = sum([
            "bank_account_verification" in checks_run,
            "gst_verification" in checks_run,
            "grn_match" in checks_run,
            "email_domain_verification" in checks_run,
        ])

        if decision == "reject":
            return 0.10 + 0.05 * signals
        elif decision == "approve":
            # Paying the fraudster — heaviest penalty in the task.
            return -0.40
        elif decision == "partial_approve":
            return -0.20
        elif decision == "hold":
            return 0.08 + 0.03 * signals
        return 0.0

    def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float:
        """Reward routing; legal and security are the priority teams for fraud."""
        routes = {"legal": 0.14, "security": 0.12, "finance": 0.08, "procurement": 0.06}
        return routes.get(team.lower(), 0.0)

    def simulate_close(self, summary: str, ep: EpisodeData) -> float:
        """Reward closing only after a reject decision."""
        # NOTE(review): ep.closed is False here — presumably evaluated before the
        # env sets the closed flag for this very action; confirm in env.
        if ep.closed is False and ep.decision == "reject":
            return 0.06
        return 0.0

    def grade(self, ep: EpisodeData) -> Dict[str, float]:
        """Score the finished episode; also reports how many fraud signals were found."""
        checks_run = {c.check_name for c in ep.checks}

        bank_found = "bank_account_verification" in checks_run
        gst_found = "gst_verification" in checks_run
        qty_found = "grn_match" in checks_run
        domain_found = "email_domain_verification" in checks_run
        price_found = "price_check" in checks_run

        # Diagnosis — credit per fraud signal surfaced (max 0.50).
        d = ((0.12 if bank_found else 0) + (0.12 if gst_found else 0)
             + (0.10 if qty_found else 0) + (0.10 if domain_found else 0)
             + (0.06 if price_found else 0))

        # Investigation — reward phone, penalise email
        i = 0.0
        for q in ep.queries:
            if q.target == "supplier" and q.channel not in ("email", "mail"):
                i += 0.10
            elif q.target == "supplier" and q.channel in ("email", "mail"):
                i -= 0.15
        if "legal" in {q.target for q in ep.queries}:
            i += 0.06
        if "security" in {q.target for q in ep.queries}:
            i += 0.06

        # Decision — reject scales with the 4 core signals found.
        signals = sum([bank_found, gst_found, qty_found, domain_found])
        dec = 0.0
        if ep.decision == "reject":
            dec = 0.08 + 0.03 * signals
        elif ep.decision == "approve":
            dec = -0.35
        elif ep.decision == "partial_approve":
            dec = -0.15
        elif ep.decision == "hold":
            dec = 0.06

        # Routing
        routes = set(ep.routed_to)
        route = ((0.10 if "legal" in routes else 0)
                 + (0.06 if "security" in routes else 0)
                 + (0.04 if "finance" in routes else 0))

        # Closure
        closure = 0.06 if (ep.closed and ep.decision == "reject") else 0.0

        # Efficiency — small bonus that decays after 12 steps.
        eff = max(0.0, 0.04 - 0.002 * max(0, ep.step_count - 12))

        total = d + i + dec + route + closure + eff
        return {
            "score": round(max(0.0, min(1.0, total)), 4),
            "signals_found": sum([bank_found, gst_found, qty_found, domain_found, price_found]),
            "diagnosis_score": round(d, 4),
            "investigation_score": round(i, 4),
            "decision_score": round(dec, 4),
            "routing_score": round(route, 4),
            "closure_score": round(closure, 4),
            "efficiency_score": round(eff, 4),
        }
964
+
965
+
966
+ # ---------------------------------------------------------------------------
967
+ # Task Registry
968
+ # ---------------------------------------------------------------------------
969
+
970
# Maps task_id -> task class for every scenario defined in this module.
TASK_REGISTRY: Dict[str, type] = {
    "task1_price_variance": PriceVarianceTask,
    "task2_duplicate_tax": DuplicateTaxErrorTask,
    "task3_compound_fraud": CompoundFraudTask,
}

# All registered task IDs, in registry (insertion) order.
ALL_TASKS = list(TASK_REGISTRY.keys())
977
+
978
+
979
def make_task(task_id: str) -> BaseTask:
    """Create a fresh task instance for the given ID.

    Raises:
        ValueError: if ``task_id`` is not present in ``TASK_REGISTRY``.
    """
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task '{task_id}'. Available: {ALL_TASKS}")
    return TASK_REGISTRY[task_id]()
inference.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Competition inference script for the Invoice Exception Handler environment.
3
+
4
+ Uses the OpenAI client to call an LLM that acts as an AP analyst.
5
+ Reads API_BASE_URL, MODEL_NAME, HF_TOKEN from environment variables.
6
+ Emits [START], [STEP], [END] lines to stdout as required by the spec.
7
+
8
+ Usage:
9
+ export API_BASE_URL="https://router.huggingface.co/v1"
10
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
11
+ export HF_TOKEN="your-token"
12
+ python inference.py
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ import re
19
+ import sys
20
+
21
+ from openai import OpenAI
22
+
23
+ from env import InvoiceExceptionEnv, Action, ALL_TASKS
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Configuration from environment variables
27
+ # ---------------------------------------------------------------------------
28
+
29
# OpenAI-compatible endpoint; defaults to the Hugging Face inference router.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# Model identifier passed to chat.completions.create.
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# Credential: HF_TOKEN takes priority, then API_KEY, else empty string.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "")
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # System prompt — tells the LLM how to act
36
+ # ---------------------------------------------------------------------------
37
+
38
+ SYSTEM_PROMPT = """You are an expert Accounts Payable (AP) analyst handling flagged invoice exceptions.
39
+
40
+ You have access to a document packet: Purchase Order (PO), Invoice, Goods Receipt Note (GRN), Supplier Master, and an Exception Flag explaining why this invoice was flagged.
41
+
42
+ You must investigate the root cause, apply business rules, make a decision, and close the case.
43
+
44
+ **Your action space** (respond with exactly ONE JSON action per turn):
45
+
46
+ 1. inspect_field: {"type": "inspect_field", "params": {"document": "invoice|po|grn|supplier_master", "field": "field_name"}}
47
+ 2. cross_check: {"type": "cross_check", "params": {"field": "field_name", "doc_a": "doc1", "doc_b": "doc2"}}
48
+ 3. run_check: {"type": "run_check", "params": {"check_name": "check_name"}}
49
+ 4. query_supplier: {"type": "query_supplier", "params": {"question": "your question", "channel": "phone|email"}}
50
+ 5. query_internal: {"type": "query_internal", "params": {"department": "dept_name", "question": "your question"}}
51
+ 6. apply_rule: {"type": "apply_rule", "params": {"rule_id": "rule_id"}}
52
+ 7. make_decision: {"type": "make_decision", "params": {"decision": "approve|reject|hold|partial_approve", "reason": "explanation"}}
53
+ 8. route_to: {"type": "route_to", "params": {"team": "team_name", "notes": "routing notes"}}
54
+ 9. close_case: {"type": "close_case", "params": {"summary": "audit trail summary"}}
55
+
56
+ **Rules:**
57
+ - Always investigate before making a decision
58
+ - Never approve without running checks first
59
+ - If fraud is suspected, NEVER contact the supplier via email — use phone only
60
+ - Respond with ONLY a JSON object, no extra text
61
+ """
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Prompt builder
66
+ # ---------------------------------------------------------------------------
67
+
68
def build_prompt(obs, step: int, max_steps: int, history: list) -> str:
    """Build the per-turn user prompt from the current observation state.

    Args:
        obs: Environment observation; must expose exception_flag,
            available_checks/rules, knowledge_base, cumulative_reward,
            case_status, and the progress collections read below.
        step: Current step number (1-based, as rendered).
        max_steps: Step budget for the episode.
        history: Prior action/result summaries; only the last 5 are shown.

    Returns:
        A newline-joined prompt ending with the "next action" instruction.
    """
    # Fix: constant strings carried pointless f-prefixes (ruff F541) — removed.
    lines = [
        f"Step {step} of {max_steps}.",
        "",
        f"EXCEPTION FLAG: {obs.exception_flag.flag_code} — {obs.exception_flag.flag_description}",
        "",
        f"Available checks: {', '.join(obs.available_checks)}",
        f"Available rules: {', '.join(obs.available_rules)}",
        "",
        "Knowledge base:",
    ]
    for entry in obs.knowledge_base:
        lines.append(f" - {entry}")

    lines.append("")
    lines.append(f"Cumulative reward so far: {obs.cumulative_reward:.2f}")
    lines.append(f"Case status: {obs.case_status}")

    # Progress sections are only rendered once they have content, so the
    # first turns stay short.
    if obs.checks_run:
        lines.append(f"Checks already run: {', '.join(c.check_name for c in obs.checks_run)}")
    if obs.queries:
        lines.append(f"Queries made: {', '.join(q.target for q in obs.queries)}")
    if obs.inspections:
        lines.append(f"Fields inspected: {', '.join(f'{i.document}.{i.field}' for i in obs.inspections)}")
    if obs.rules_applied:
        lines.append(f"Rules applied: {', '.join(obs.rules_applied)}")
    if obs.decision:
        lines.append(f"Decision made: {obs.decision}")
    if obs.routed_to:
        lines.append(f"Routed to: {', '.join(obs.routed_to)}")

    if history:
        lines.append("")
        lines.append("Recent history:")
        for h in history[-5:]:
            lines.append(f" {h}")

    lines.append("")
    lines.append("What is your next action? Respond with a single JSON object.")

    return "\n".join(lines)
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # LLM caller
114
+ # ---------------------------------------------------------------------------
115
+
116
def call_llm(client: OpenAI, user_prompt: str) -> str:
    """Send the system and user prompts to the chat model; return its raw text.

    Any API failure is logged to stderr and replaced by a safe default
    action (run_check/po_match) encoded as JSON, so the episode continues.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=conversation,
            temperature=0.1,
            max_tokens=256,
        )
        return completion.choices[0].message.content or ""
    except Exception as exc:
        print(f"LLM call failed: {exc}", file=sys.stderr)
        return '{"type": "run_check", "params": {"check_name": "po_match"}}'
132
+
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Action parser
136
+ # ---------------------------------------------------------------------------
137
+
138
def parse_action(raw_text: str) -> dict:
    """
    Parse the model's response into an action dict.
    Handles markdown code fences, extra whitespace, and minor formatting errors.
    Falls back to run_check(po_match) if parsing fails.
    """
    cleaned = raw_text.strip()

    # Strip a leading ```/```json fence (and a trailing ``` if one exists).
    if cleaned.startswith("```"):
        fence_lines = cleaned.split("\n")
        if fence_lines[-1].strip() == "```":
            fence_lines = fence_lines[1:-1]
        else:
            fence_lines = fence_lines[1:]
        cleaned = "\n".join(fence_lines)

    try:
        return json.loads(cleaned.strip())
    except json.JSONDecodeError:
        pass

    # The whole text wasn't JSON — look for an embedded {...} object.
    embedded = re.search(r'\{.*\}', cleaned, re.DOTALL)
    if embedded is not None:
        try:
            return json.loads(embedded.group())
        except json.JSONDecodeError:
            pass

    # Safe fallback
    return {"type": "run_check", "params": {"check_name": "po_match"}}
166
+
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # Task runner
170
+ # ---------------------------------------------------------------------------
171
+
172
def run_task(client: OpenAI, env: InvoiceExceptionEnv, task_id: str, max_steps: int = 20) -> tuple:
    """Run one task episode and return (steps_taken, score, rewards).

    Drives the observe -> prompt -> LLM -> parse -> step loop for up to
    ``max_steps`` steps, emitting [START]/[STEP]/[END] log lines.

    Args:
        client: OpenAI-compatible client used for LLM calls.
        env: Environment instance exposing reset/step/grade.
        task_id: Task identifier passed to ``env.reset``.
        max_steps: Hard cap on the number of steps in the episode.

    Returns:
        Tuple of (steps_taken, final grade score, list of per-step rewards).
    """
    rewards = []

    print(f"[START] task={task_id} env=invoice-exception-handler model={MODEL_NAME}", flush=True)

    obs = env.reset(task_id)
    history = []
    step = 0  # ensure `step` is defined even if max_steps < 1

    for step in range(1, max_steps + 1):
        # Build prompt from observation
        user_prompt = build_prompt(obs, step, max_steps, history)

        # Call LLM
        raw = call_llm(client, user_prompt)
        action_dict = parse_action(raw)

        # Execute; a failing step is logged but does not abort the episode
        try:
            result = env.step(action_dict)
            reward = result.reward
            done = result.done
            error = None
        except Exception as e:
            reward = 0.0
            done = False
            error = str(e)
            result = None

        rewards.append(reward)
        action_str = json.dumps(action_dict)

        print(
            f"[STEP] step={step} action={action_str} "
            f"reward={reward:.2f} done={str(done).lower()} "
            f"error={error or 'null'}",
            flush=True,
        )

        history.append(f"Step {step}: {action_str} -> reward {reward:+.2f}")

        # `if result:` would mis-handle a falsy-but-valid StepResult;
        # test identity against None instead.
        if result is not None:
            obs = result.observation

        if done:
            break

    score = env.grade()["score"]
    success = score >= 0.5
    steps_taken = min(step, max_steps)
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)

    print(
        f"[END] success={str(success).lower()} steps={steps_taken} "
        f"score={score:.3f} rewards={rewards_str}",
        flush=True,
    )

    return steps_taken, score, rewards
231
+
232
+
233
+ # ---------------------------------------------------------------------------
234
+ # Main
235
+ # ---------------------------------------------------------------------------
236
+
237
def main() -> None:
    """Run inference across every task and print the mean score."""
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    env = InvoiceExceptionEnv(seed=42)

    # run_task returns (steps, score, rewards); only the score matters here.
    all_scores = [run_task(client, env, task_id)[1] for task_id in ALL_TASKS]

    avg = sum(all_scores) / len(all_scores) if all_scores else 0.0
    print(f"\nAverage score: {avg:.3f}", flush=True)


if __name__ == "__main__":
    main()
openenv.yaml ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # openenv.yaml
2
+ name: Invoice Exception Handler
3
+ version: "1.0.0"
4
+ description: |
5
+ An agent learning environment simulating accounts payable exception handling.
6
+ The agent acts as an AP analyst: investigates flagged invoices, applies business
7
+ rules, detects fraud signals, makes decisions, and closes cases with an audit trail.
8
+
9
+ authors:
10
+ - name: Mohammed Yusuf, Nadella Harshith
11
+ email: [yusufindian09@gmail.com, nadellaharshith4@gmail.com]
12
+
13
+ license: MIT
14
+
15
+ tasks:
16
+ - id: task1_price_variance
17
+ name: Price Variance Exception
18
+ difficulty: easy
19
+ description: |
20
+ Office stationery invoice arrives 3.08% above PO. Company tolerance policy
21
+ allows +/-2% auto-approval. Agent must detect the variance, verify through
22
+ the tolerance rule, confirm verbal approval with procurement, and approve
23
+ with a PO amendment request.
24
+ max_steps: 18
25
+ optimal_score: 1.0
26
+ min_passing_score: 0.60
27
+
28
+ - id: task2_duplicate_tax
29
+ name: Duplicate Invoice with Tax Error
30
+ difficulty: medium
31
+ description: |
32
+ Logistics supplier submits INV-2024-891, a duplicate of paid INV-2024-819
33
+ (digit transposition: 891 vs 819). Original invoice had wrong GST rate (15%
34
+ vs correct 18%) — company overpaid 3,240 INR. New invoice has correct rate.
35
+ Agent must detect the duplicate, identify the tax error in the original,
36
+ and partially approve only the 3,240 INR tax correction.
37
+ max_steps: 20
38
+ optimal_score: 1.0
39
+ min_passing_score: 0.50
40
+
41
+ - id: task3_compound_fraud
42
+ name: Compound Fraud Signals
43
+ difficulty: hard
44
+ description: |
45
+ IT equipment supplier invoice with four simultaneous fraud signals: bank
46
+ account changed via BEC attack (lookalike email domain), GSTIN belongs to
47
+ a different entity, 2 of 15 laptops not yet received, and unit price 8.65%
48
+ above PO. Agent must find all signals, use the correct communication channel
49
+ (phone, not email — which would contact the fraudster), and escalate to legal
50
+ and security.
51
+ max_steps: 25
52
+ optimal_score: 1.0
53
+ min_passing_score: 0.40
54
+
55
+ observation_space:
56
+ type: object
57
+ description: EnvironmentState Pydantic model
58
+ fields:
59
+ task_id: {type: string}
60
+ step_number: {type: integer}
61
+ case_status: {type: string, enum: [open, in_review, decided, routed, closed]}
62
+ purchase_order: {type: object, description: "PO with line items and terms"}
63
+ invoice: {type: object, description: "Supplier invoice with line items and tax"}
64
+ grn: {type: object, description: "Goods receipt — what actually arrived"}
65
+ supplier_master: {type: object, description: "Verified supplier record"}
66
+ exception_flag: {type: object, description: "Why the system flagged this invoice"}
67
+ inspections: {type: array, description: "Fields the agent has inspected"}
68
+ checks_run: {type: array, description: "Validation checks completed"}
69
+ queries: {type: array, description: "Internal and supplier queries"}
70
+ rules_applied: {type: array, description: "Business rules applied"}
71
+ decision: {type: string, nullable: true}
72
+ routed_to: {type: array}
73
+ available_actions: {type: array}
74
+ available_checks: {type: array}
75
+ available_rules: {type: array}
76
+ knowledge_base: {type: array}
77
+ cumulative_reward: {type: number}
78
+
79
+ action_space:
80
+ type: object
81
+ description: Action with type and params
82
+ actions:
83
+ inspect_field:
84
+ params: {document: string, field: string}
85
+ cross_check:
86
+ params: {field: string, doc_a: string, doc_b: string}
87
+ run_check:
88
+ params: {check_name: string}
89
+ query_supplier:
90
+ params: {question: string, channel: string}
91
+ query_internal:
92
+ params: {department: string, question: string}
93
+ apply_rule:
94
+ params: {rule_id: string}
95
+ make_decision:
96
+ params: {decision: string, reason: string}
97
+ route_to:
98
+ params: {team: string, notes: string}
99
+ close_case:
100
+ params: {summary: string}
101
+
102
+ reward:
103
+ range: [-1.0, 1.0]
104
+ description: |
105
+ Shaped reward at every step. Relevant inspections: +0.01 to +0.14.
106
+ Diagnostics revealing issues: +0.08 to +0.18. Correct fixes: +0.08 to +0.30.
107
+ Wrong decision on fraud: -0.15 to -0.40. Repeat actions: -0.02 to -0.05.
108
+ SLA breach: -0.10.
109
+
110
+ grading:
111
+ method: task_grader
112
+ scores:
113
+ - score
114
+ - diagnosis_score
115
+ - investigation_score
116
+ - decision_score
117
+ - routing_score
118
+ - closure_score
119
+ - efficiency_score
120
+
121
+ api:
122
+ reset:
123
+ signature: "reset(task_id: str | None = None) -> EnvironmentState"
124
+ step:
125
+ signature: "step(action: Action | dict) -> StepResult"
126
+ state:
127
+ signature: "state() -> EnvironmentState"
128
+ grade:
129
+ signature: "grade() -> Dict[str, float]"
130
+
131
+ http_endpoints:
132
+ - path: /reset
133
+ method: POST
134
+ description: Reset environment, returns EnvironmentState JSON
135
+ - path: /step
136
+ method: POST
137
+ description: Execute action, returns StepResult JSON
138
+ - path: /state
139
+ method: GET
140
+ description: Current state, returns EnvironmentState JSON
141
+ - path: /grade
142
+ method: POST
143
+ description: Grade current episode
144
+ - path: /health
145
+ method: GET
146
+ description: Health check
147
+
148
+ dependencies:
149
+ python: ">=3.10"
150
+ packages:
151
+ - pydantic>=2.7
152
+ - fastapi>=0.111
153
+ - uvicorn>=0.29
154
+ - gradio>=4.36
155
+ - openai>=1.35
156
+ - pyyaml>=6.0
157
+
158
+ docker:
159
+ port: 7860
160
+ health_check: /health
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ pydantic==2.11.1
2
+ fastapi==0.115.12
3
+ uvicorn==0.34.2
4
+ gradio==5.23.3
5
+ openai==1.75.0
6
+ pyyaml==6.0.2
7
+ httpx==0.28.1
8
+ python-multipart==0.0.20
9
+ openenv-core==0.1.0