YUS200619 committed on
Commit
562f58d
·
1 Parent(s): fbe9c8c

feat: complete invoice exception handler v1.0.0

Browse files
.gitignore ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual env
7
+ venv/
8
+ .venv/
9
+ env_venv/
10
+
11
+ # IDE
12
+ .vscode/
13
+ .idea/
14
+ *.swp
15
+
16
+ # OS
17
+ .DS_Store
18
+ Thumbs.db
19
+
20
+ # Secrets
21
+ .env
22
+ *.env
23
+
24
+ # Test files
25
+ test_smoke.py
26
+
27
+ # Misc
28
+ *.egg-info/
29
+ dist/
30
+ build/
Dockerfile ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base keeps the image small; 3.11 matches the app's requirements.
FROM python:3.11-slim

# Install system dependencies
# curl is kept at runtime because the HEALTHCHECK below shells out to it.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user (required by HF Spaces)
RUN useradd -m -u 1000 appuser

WORKDIR /app

# Copy and install dependencies first (layer caching:
# code edits won't invalidate the pip-install layer)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code, owned by the non-root user
COPY --chown=appuser:appuser . .

USER appuser

# 7860 is the single port HF Spaces exposes
EXPOSE 7860

# Health check — pings the /health endpoint served by app.py
HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

# Unbuffered stdout so logs stream immediately; Gradio binds to all
# interfaces on the port the platform expects.
ENV PYTHONUNBUFFERED=1
ENV GRADIO_SERVER_NAME=0.0.0.0
ENV GRADIO_SERVER_PORT=7860

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Combined FastAPI + Gradio application for the Invoice Exception Handler.
3
+
4
+ Serves both the HTTP API endpoints (for the OpenEnv validator) and an
5
+ interactive Gradio UI (for judges and exploration) on port 7860.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from typing import Any, Dict, Optional
11
+
12
+ import gradio as gr
13
+ import uvicorn
14
+ from fastapi import FastAPI
15
+ from fastapi.responses import JSONResponse
16
+
17
+ from env import InvoiceExceptionEnv, Action, ActionType, ALL_TASKS
18
+
19
# ---------------------------------------------------------------------------
# Shared environment instance
# ---------------------------------------------------------------------------

# Single module-level environment backing the HTTP API. Seed 42 keeps
# episodes reproducible across restarts.
# NOTE(review): this instance is shared by all API clients — concurrent
# validators would interleave episodes; confirm single-client use is assumed.
env = InvoiceExceptionEnv(seed=42)

# ---------------------------------------------------------------------------
# FastAPI server
# ---------------------------------------------------------------------------

api = FastAPI(title="Invoice Exception Handler OpenEnv", version="1.0.0")
30
+
31
+
32
@api.post("/reset")
async def http_reset(body: Optional[Dict[str, Any]] = None) -> JSONResponse:
    """Reset the environment. Optionally specify ``task_id`` in the JSON body.

    Returns the initial observation serialized as JSON.
    """
    # A mutable default (``body: dict = {}``) is a classic Python pitfall;
    # use None as the "no body" sentinel and normalise it here instead.
    task_id = (body or {}).get("task_id")
    obs = env.reset(task_id)
    return JSONResponse(obs.model_dump(mode="json"))
38
+
39
+
40
@api.post("/step")
async def http_step(body: Optional[Dict[str, Any]] = None) -> JSONResponse:
    """Execute one action described by the JSON body.

    Returns the StepResult (observation, reward, done, info) as JSON.
    """
    # Avoid a mutable default argument; treat a missing body as an empty
    # action dict, exactly as the old ``dict = {}`` default behaved.
    result = env.step(body or {})
    return JSONResponse(result.model_dump(mode="json"))
45
+
46
+
47
@api.get("/state")
async def http_state() -> JSONResponse:
    """Return a snapshot of the current state without advancing the episode."""
    snapshot = env.state()
    return JSONResponse(snapshot.model_dump(mode="json"))
51
+
52
+
53
@api.post("/grade")
async def http_grade() -> JSONResponse:
    """Score the current episode and return the grader's result dict."""
    scores = env.grade()
    return JSONResponse(scores)
57
+
58
+
59
@api.get("/tasks")
async def http_tasks() -> JSONResponse:
    """Return the identifiers of every task this environment ships with."""
    task_ids = ALL_TASKS
    return JSONResponse(task_ids)
63
+
64
+
65
@api.get("/health")
async def health() -> JSONResponse:
    """Liveness probe used by the Docker HEALTHCHECK and the HF validator."""
    payload = {"status": "ok", "version": "1.0.0"}
    return JSONResponse(payload)
69
+
70
+
71
# ---------------------------------------------------------------------------
# Gradio UI — environment for interactive play
# ---------------------------------------------------------------------------

# Per-session environment for the Gradio UI (separate from the API env)
# NOTE(review): these module-level globals are shared by ALL concurrent
# Gradio sessions, so two simultaneous users would interleave one episode —
# confirm single-user demo use is acceptable, or move to gr.State.
ui_env = InvoiceExceptionEnv(seed=42)
# Human-readable log lines of actions taken in the current UI episode.
ui_history: list[str] = []
78
+
79
+
80
def reset_task(task_name: str) -> tuple:
    """Reset the UI environment to the task picked in the dropdown.

    Returns display strings for the exception flag, available checks,
    available rules, knowledge base, status bar, and blank history/grade
    panes, in the order the Gradio outputs expect.
    """
    global ui_history
    ui_history = []

    task_map = {
        "Task 1 — Price Variance (Easy)": "task1_price_variance",
        "Task 2 — Duplicate Tax (Medium)": "task2_duplicate_tax",
        "Task 3 — Compound Fraud (Hard)": "task3_compound_fraud",
    }
    obs = ui_env.reset(task_map.get(task_name, "task1_price_variance"))

    flag = obs.exception_flag
    flag_text = f"**{flag.flag_code}**: {flag.flag_description}"
    status_text = (
        f"Step: {obs.step_number} | Status: {obs.case_status.value} "
        f"| Reward: {obs.cumulative_reward:.2f}"
    )
    kb_lines = [f"- {entry}" for entry in obs.knowledge_base]

    return (
        flag_text,
        ", ".join(obs.available_checks),
        ", ".join(obs.available_rules),
        "\n".join(kb_lines),
        status_text,
        "",
        "",
    )
100
+
101
+
102
def execute_action(action_type: str, param1: str, param2: str, param3: str) -> tuple:
    """Run one agent action against the UI environment.

    The three generic textbox values are mapped onto the parameter names
    the chosen action type expects, then stepped through ``ui_env``.
    Returns display strings: reward, status bar, action history, raw info
    JSON, and (when the episode finishes) the final grade breakdown.
    """
    global ui_history

    # Positional textbox values → named params, keyed by action type.
    # Unknown action types fall through to an empty params dict.
    param_names = {
        "inspect_field": ("document", "field"),
        "cross_check": ("field", "doc_a", "doc_b"),
        "run_check": ("check_name",),
        "query_supplier": ("question", "channel"),
        "query_internal": ("department", "question"),
        "apply_rule": ("rule_id",),
        "make_decision": ("decision", "reason"),
        "route_to": ("team", "notes"),
        "close_case": ("summary",),
    }
    params: Dict[str, Any] = dict(
        zip(param_names.get(action_type, ()), (param1, param2, param3))
    )
    if action_type == "query_supplier":
        # An empty channel textbox defaults to "phone".
        params["channel"] = param2 or "phone"

    try:
        result = ui_env.step({"type": action_type, "params": params})
        obs = result.observation

        reward_text = f"**Reward:** {result.reward:+.2f}"
        info_text = json.dumps(result.info, indent=2, default=str)
        status_text = (
            f"Step: {obs.step_number} | Status: {obs.case_status.value} | "
            f"Reward: {obs.cumulative_reward:.2f} | Done: {result.done}"
        )

        ui_history.append(
            f"Step {obs.step_number}: {action_type}({param1}) → {result.reward:+.2f}"
        )

        grade_text = ""
        if result.done:
            # Episode over: surface the grader's overall score plus sub-scores.
            scores = ui_env.grade()
            grade_lines = [f"**Final Grade: {scores['score']:.4f}**", ""]
            grade_lines.extend(
                f"- {k}: {v}" for k, v in scores.items() if k != "score"
            )
            grade_text = "\n".join(grade_lines)

        return reward_text, status_text, "\n".join(ui_history), info_text, grade_text

    except Exception as e:
        # UI boundary: show the error in the reward pane instead of crashing.
        return f"**Error:** {str(e)}", "", "\n".join(ui_history), "", ""
153
+
154
+
155
def run_demo(task_name: str) -> str:
    """Run a hardcoded optimal sequence and show step-by-step results.

    Creates a fresh seeded environment, replays the scripted action list
    for the selected task, and returns a markdown transcript ending with
    the grader's final score breakdown.
    """
    # Dropdown label → internal task id; unknown labels fall back to task 1.
    task_map = {
        "Task 1 — Price Variance (Easy)": "task1_price_variance",
        "Task 2 — Duplicate Tax (Medium)": "task2_duplicate_tax",
        "Task 3 — Compound Fraud (Hard)": "task3_compound_fraud",
    }
    task_id = task_map.get(task_name, "task1_price_variance")

    # Optimal action sequences for each task (kept in sync with the graders).
    sequences = {
        "task1_price_variance": [
            Action.run_check("po_match"),
            Action.run_check("tolerance_rule"),
            Action.cross_check("unit_price", "invoice", "po"),
            Action.run_check("grn_match"),
            Action.query_supplier("Why do prices differ from PO?", "email"),
            Action.query_internal("procurement", "Did you approve the price increase?"),
            Action.apply_rule("tolerance_exception_approval"),
            Action.make_decision("approve", "Price increase verbally approved by procurement. PO amendment pending."),
            Action.route_to("procurement", "Please raise PO amendment for the price variance."),
            Action.close_case("Invoice approved. Procurement confirmed verbal approval. PO amendment requested."),
        ],
        "task2_duplicate_tax": [
            Action.run_check("duplicate_detection"),
            Action.inspect_field("invoice", "invoice_number"),
            Action.run_check("tax_calculation_verify"),
            Action.cross_check("tax_amount", "invoice", "payment_history"),
            Action.query_internal("finance", "Can you confirm the overpayment on INV-2024-819?"),
            Action.query_supplier("Please clarify the relationship between INV-2024-891 and INV-2024-819.", "email"),
            Action.apply_rule("partial_approval"),
            Action.apply_rule("credit_note_request"),
            Action.make_decision("partial_approve", "Duplicate detected. Tax error on original. Approve only 3,240 INR correction."),
            Action.route_to("finance", "Process 3,240 INR tax correction entry."),
            Action.close_case("Duplicate invoice with tax correction. Partial approval for delta only."),
        ],
        "task3_compound_fraud": [
            # Note: supplier contact uses "phone" — email is the compromised
            # channel in this scenario.
            Action.inspect_field("invoice", "bank_account"),
            Action.run_check("bank_account_verification"),
            Action.run_check("email_domain_verification"),
            Action.inspect_field("invoice", "supplier_gstin"),
            Action.run_check("gst_verification"),
            Action.inspect_field("grn", "items_received"),
            Action.run_check("grn_match"),
            Action.run_check("price_check"),
            Action.query_supplier("Please confirm your bank details and recent invoices.", "phone"),
            Action.query_internal("security", "Suspected BEC attack — lookalike domain detected."),
            Action.apply_rule("fraud_hold"),
            Action.make_decision("reject", "Four fraud signals: bank BEC, GSTIN mismatch, quantity mismatch, price inflation."),
            Action.route_to("legal", "Initiate supplier audit and fraud investigation."),
            Action.route_to("security", "BEC investigation — lookalike domain techcore-solutions.com."),
            Action.close_case("Fraud detected. Invoice rejected. Legal and security notified."),
        ],
    }

    # Fresh environment so the demo never disturbs API or Manual Play state.
    demo_env = InvoiceExceptionEnv(seed=42)
    obs = demo_env.reset(task_id)
    actions = sequences.get(task_id, [])

    lines = [f"# Demo: {task_name}", f"**Flag:** {obs.exception_flag.flag_description}", ""]

    for idx, action in enumerate(actions, 1):
        try:
            result = demo_env.step(action)
            action_desc = f"{action.type.value}({json.dumps(action.params)})"
            lines.append(f"**Step {idx}:** `{action_desc}`")
            lines.append(f" Reward: {result.reward:+.2f} | Cumulative: {result.observation.cumulative_reward:.2f}")

            # Show a truncated detail line when the env returned one
            # (either nested under info["result"] or at the top level).
            if result.info.get("result"):
                detail = result.info["result"].get("detail", result.info["result"].get("value", ""))
                if detail:
                    lines.append(f" → {str(detail)[:120]}")
            elif result.info.get("detail"):
                lines.append(f" → {str(result.info['detail'])[:120]}")

            lines.append("")
            if result.done:
                # Episode ended early (e.g. case closed) — stop replaying.
                break
        except Exception as e:
            # Keep the transcript going even if one scripted action fails.
            lines.append(f" Error: {e}")
            lines.append("")

    # Final grade: overall score first, sub-scores next, signals last.
    scores = demo_env.grade()
    lines.append("---")
    lines.append(f"## Final Score: {scores['score']:.4f}")
    for k, v in scores.items():
        if k != "score" and k != "signals_found":
            lines.append(f"- {k}: {v}")
    if "signals_found" in scores:
        lines.append(f"- signals_found: {scores['signals_found']}")

    return "\n".join(lines)
247
+
248
+
249
def build_gradio_ui() -> gr.Blocks:
    """Build the three-tab Gradio interface.

    Tabs: Manual Play (step the env by hand), Agent Demo (replay the
    scripted optimal agent), and API Reference (static documentation).
    """

    with gr.Blocks(
        title="Invoice Exception Handler — OpenEnv",
        theme=gr.themes.Soft(),
    ) as demo:
        gr.Markdown("# 🧾 Invoice Exception Handler — OpenEnv")
        gr.Markdown("An AI agent learning environment for accounts payable exception handling.")

        with gr.Tabs():
            # ----- Tab 1: Manual Play -----
            with gr.TabItem("🎮 Manual Play"):
                with gr.Row():
                    task_dropdown = gr.Dropdown(
                        choices=[
                            "Task 1 — Price Variance (Easy)",
                            "Task 2 — Duplicate Tax (Medium)",
                            "Task 3 — Compound Fraud (Hard)",
                        ],
                        value="Task 1 — Price Variance (Easy)",
                        label="Select Task",
                    )
                    reset_btn = gr.Button("🔄 Reset", variant="primary")

                # Read-only panels describing the current episode.
                flag_display = gr.Markdown(label="Exception Flag")
                with gr.Row():
                    checks_display = gr.Textbox(label="Available Checks", interactive=False)
                    rules_display = gr.Textbox(label="Available Rules", interactive=False)
                kb_display = gr.Markdown(label="Knowledge Base")
                status_display = gr.Textbox(label="Status", interactive=False)

                gr.Markdown("### Take an Action")
                # Three generic textboxes; execute_action() maps them onto the
                # parameter names each action type expects.
                with gr.Row():
                    action_type_input = gr.Dropdown(
                        choices=[at.value for at in ActionType],
                        value="run_check",
                        label="Action Type",
                    )
                    param1_input = gr.Textbox(label="Param 1 (check_name / document / field / question / decision / team / summary)")
                    param2_input = gr.Textbox(label="Param 2 (field / channel / department / reason / notes)")
                    param3_input = gr.Textbox(label="Param 3 (doc_b, if cross_check)")

                action_btn = gr.Button("▶️ Execute Action", variant="primary")

                reward_display = gr.Markdown(label="Reward")
                action_info = gr.Textbox(label="Action Info (JSON)", lines=4, interactive=False)
                history_display = gr.Textbox(label="Action History", lines=8, interactive=False)
                grade_display = gr.Markdown(label="Grade (shown when episode ends)")

                # Wire buttons to the module-level handlers.
                reset_btn.click(
                    reset_task,
                    inputs=[task_dropdown],
                    outputs=[flag_display, checks_display, rules_display,
                             kb_display, status_display, history_display, grade_display],
                )
                action_btn.click(
                    execute_action,
                    inputs=[action_type_input, param1_input, param2_input, param3_input],
                    outputs=[reward_display, status_display, history_display,
                             action_info, grade_display],
                )

            # ----- Tab 2: Agent Demo -----
            with gr.TabItem("🤖 Agent Demo"):
                gr.Markdown("Watch a hardcoded optimal agent solve each task step by step.")
                demo_task = gr.Dropdown(
                    choices=[
                        "Task 1 — Price Variance (Easy)",
                        "Task 2 — Duplicate Tax (Medium)",
                        "Task 3 — Compound Fraud (Hard)",
                    ],
                    value="Task 1 — Price Variance (Easy)",
                    label="Select Task",
                )
                demo_btn = gr.Button("▶️ Run Demo", variant="primary")
                demo_output = gr.Markdown()
                demo_btn.click(run_demo, inputs=[demo_task], outputs=[demo_output])

            # ----- Tab 3: API Reference -----
            # Static markdown only; no callbacks.
            with gr.TabItem("📖 API Reference"):
                gr.Markdown("""
## Action Types

| Action | Params | Description |
|--------|--------|-------------|
| `inspect_field` | `document, field` | Look at a specific field in a document |
| `cross_check` | `field, doc_a, doc_b` | Compare a field between two documents |
| `run_check` | `check_name` | Run a named validation check |
| `query_supplier` | `question, channel` | Ask the supplier (channel: phone or email) |
| `query_internal` | `department, question` | Ask an internal team |
| `apply_rule` | `rule_id` | Apply a business policy rule |
| `make_decision` | `decision, reason` | approve / reject / hold / partial_approve |
| `route_to` | `team, notes` | Escalate to a team |
| `close_case` | `summary` | Close with an audit trail summary |

## Reward Ranges

| Event | Reward |
|-------|--------|
| Inspecting a key field | +0.01 to +0.14 |
| Cross-check finds mismatch | +0.12 to +0.15 |
| Running a diagnostic check | +0.08 to +0.18 |
| Correct decision | +0.18 to +0.28 |
| Wrong decision on fraud | −0.35 to −0.40 |
| Contacting supplier via email (fraud) | −0.15 |
| Repeat action | −0.02 to −0.05 |
| SLA breach | −0.10 |

## HTTP API

```
POST /reset — Body: {"task_id": "task1_price_variance"} → EnvironmentState
POST /step — Body: {"type": "run_check", "params": {"check_name": "..."}} → StepResult
GET /state → EnvironmentState
POST /grade → {"score": 0.85, ...}
GET /tasks → ["task1_price_variance", ...]
GET /health → {"status": "ok"}
```

## Grader Sub-Scores

Each task grader returns:
- **score** — overall 0.0–1.0
- **diagnosis_score** — did the agent find the root cause?
- **investigation_score** — did the agent gather evidence properly?
- **decision_score** — was the decision correct?
- **routing_score** — was the case sent to the right team?
- **closure_score** — was the case closed with a summary?
- **efficiency_score** — bonus for not wasting steps
""")

    return demo
382
+
383
+
384
# ---------------------------------------------------------------------------
# Main — mount Gradio on FastAPI and serve
# ---------------------------------------------------------------------------

# Build the UI once at import time and mount it at the web root; the JSON
# API routes already registered on `api` keep their own paths (/reset,
# /step, /state, /grade, /tasks, /health). `app` is the ASGI entry point
# uvicorn imports via the "app:app" string below.
gradio_app = build_gradio_ui()
app = gr.mount_gradio_app(api, gradio_app, path="/")
390
+
391
if __name__ == "__main__":
    # Imports kept local: only needed when running as a script.
    import signal
    import sys

    def handle_sigint(sig, frame):
        """Graceful shutdown on Ctrl+C."""
        print("\nShutting down gracefully...")
        sys.exit(0)

    # NOTE(review): uvicorn installs its own SIGINT/SIGTERM handlers when the
    # server starts, which typically replace this one — confirm whether this
    # registration still has any effect.
    signal.signal(signal.SIGINT, handle_sigint)

    try:
        # Serve on 0.0.0.0:7860 — the single port HF Spaces exposes (see
        # Dockerfile EXPOSE/HEALTHCHECK). reload=False: no file watching in
        # the container.
        uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
    except (KeyboardInterrupt, SystemExit):
        # Swallow the shutdown signal raised by handle_sigint so the process
        # exits quietly instead of printing a traceback.
        pass
documents/ARCHITECTURE.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Architecture
2
+
3
+ ## System Overview
4
+
5
+ ```
6
+ ┌──────────────────────────────────────────────────────────────┐
7
+ │ HF Space / Docker Container │
8
+ │ │
9
+ │ ┌──────────────┐ ┌──────────────────────────────────┐ │
10
+ │ │ Gradio UI │ │ FastAPI Server │ │
11
+ │ │ (port 7860) │ │ POST /reset GET /state │ │
12
+ │ │ │ │ POST /step GET /health │ │
13
+ │ └──────┬───────┘ └──────────────┬───────────────────┘ │
14
+ │ │ │ │
15
+ │ └──────────┬────────────────┘ │
16
+ │ │ │
17
+ │ ┌──────────▼──────────────┐ │
18
+ │ │ InvoiceExceptionEnv │ │
19
+ │ │ reset() step() state() │ │
20
+ │ │ grade() │ │
21
+ │ └──────────┬──────────────┘ │
22
+ │ │ │
23
+ │ ┌──────────▼──────────────┐ │
24
+ │ │ Task Registry │ │
25
+ │ │ task1_price_variance │ │
26
+ │ │ task2_duplicate_tax │ │
27
+ │ │ task3_compound_fraud │ │
28
+ │ └─────────────────────────┘ │
29
+ └─────────────────────────────────────────────────────────────┘
30
+ ```
31
+
32
+ ## Key Design Decisions
33
+
34
+ ### FastAPI + Gradio in same process
35
+ HF Spaces requires a single port (7860). Gradio is mounted on FastAPI using
36
+ `gr.mount_gradio_app()` so both the validator API and the interactive UI
37
+ share the same process and port.
38
+
39
+ ### Pydantic v2 for all models
40
+ Required by the OpenEnv spec. Every field is typed. No `Any` fields without
41
+ explicit documentation of why.
42
+
43
+ ### EpisodeData vs EnvironmentState
44
+ - **EpisodeData** is mutable internal state tracking what the agent has done
45
+ - **EnvironmentState** is the immutable snapshot returned to the agent
46
+ - Documents (PO, Invoice, GRN) are rebuilt from task factories each time,
47
+ ensuring they are never accidentally mutated
48
+
49
+ ### Separate task classes
50
+ Each task is a self-contained class with its own documents, simulators, and
51
+ grader. This makes it trivial to add new tasks — just implement BaseTask and
52
+ register in TASK_REGISTRY.
53
+
54
+ ### Deterministic simulation
55
+ No randomness in simulators or graders. Same seed + same actions = same scores.
56
+ The only randomness is in `action_space_sample()` for baseline agents.
documents/CHANGELOG.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ All changes to the Invoice Exception Handler environment are recorded here.
4
+
5
+ ---
6
+
7
+ ## [1.0.0] — 2025-01-20
8
+
9
+ ### Added
10
+ - Initial implementation of InvoiceExceptionEnv with full OpenEnv API
11
+ - Three tasks: task1_price_variance, task2_duplicate_tax, task3_compound_fraud
12
+ - Pydantic v2 typed models for all environment objects
13
+ - FastAPI HTTP endpoints for HF Spaces validation
14
+ - Gradio UI for interactive exploration
15
+ - inference.py using OpenAI client with [START][STEP][END] log format
16
+ - openenv.yaml spec file
17
+ - Dockerfile for HF Spaces deployment
18
+
19
+ ### Design decisions
20
+ - Used pure Python simulation (no external databases) for portability and determinism
21
+ - Compound fraud task has four signals to prevent simple greedy agents from scoring well
22
+ - Channel selection in Task 3 (phone vs email) tests policy knowledge, not just anomaly detection
23
+ - Grader uses sub-scores to allow partial credit for partial solutions
documents/PRD-001-product-requirements.md ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Product Requirements Document
2
+ ## Invoice Exception Handler — OpenEnv Agent Learning Environment
3
+
4
+ **Document ID:** PRD-001
5
+ **Version:** 1.0.0
6
+ **Status:** Final
7
+ **Author:** Mohammed Yusuf Ahmed
8
+ **Last Updated:** 2025-01-20
9
+ **Classification:** Internal / Hackathon Submission
10
+
11
+ ---
12
+
13
+ ## Table of Contents
14
+
15
+ 1. [Executive Summary](#1-executive-summary)
16
+ 2. [Problem Statement](#2-problem-statement)
17
+ 3. [Product Vision](#3-product-vision)
18
+ 4. [Stakeholders](#4-stakeholders)
19
+ 5. [Functional Requirements](#5-functional-requirements)
20
+ 6. [Non-Functional Requirements](#6-non-functional-requirements)
21
+ 7. [System Architecture](#7-system-architecture)
22
+ 8. [Task Specifications](#8-task-specifications)
23
+ 9. [Reward Design](#9-reward-design)
24
+ 10. [Evaluation Criteria](#10-evaluation-criteria)
25
+ 11. [API Contract](#11-api-contract)
26
+ 12. [File Structure](#12-file-structure)
27
+ 13. [Out of Scope](#13-out-of-scope)
28
+ 14. [Change Log](#14-change-log)
29
+
30
+ ---
31
+
32
+ ## 1. Executive Summary
33
+
34
+ The Invoice Exception Handler is a real-world agent learning environment built for the OpenEnv standard. It simulates the accounts payable (AP) exception handling workflow that every business on earth runs daily — the process of investigating flagged invoices before payment is approved.
35
+
36
+ The environment places an AI agent in the role of an AP analyst. The agent receives a document packet (Purchase Order, Invoice, Goods Receipt Note, Supplier Master), reads an exception flag, and must investigate the root cause, make a decision, route the case to the right team, and close it cleanly. Every action has realistic financial and compliance consequences.
37
+
38
+ The environment ships with three tasks of increasing difficulty — price variance (easy), duplicate with hidden tax error (medium), and compound fraud with four simultaneous signals (hard).
39
+
40
+ ---
41
+
42
+ ## 2. Problem Statement
43
+
44
+ ### 2.1 The Real-World Pain
45
+
46
+ Every company that buys goods or services from suppliers receives invoices. Typically 5–15% of all invoices have exceptions — discrepancies between what was ordered (PO), what was received (GRN), and what was invoiced. These exceptions are currently handled by accounts payable clerks who manually:
47
+
48
+ 1. Pull the original Purchase Order
49
+ 2. Compare it line by line against the invoice
50
+ 3. Check the Goods Receipt Note
51
+ 4. Run validation checks
52
+ 5. Query internal teams or the supplier
53
+ 6. Make a decision (approve / reject / hold / partial approve)
54
+ 7. Route the case and document everything
55
+
56
+ At a mid-size company this is 2–4 hours of analyst time per day. At enterprise scale it is entire departments. The cost to the AP automation market exceeds $3 billion annually.
57
+
58
+ ### 2.2 The AI Gap
59
+
60
+ No existing OpenEnv benchmark tests an agent's ability to:
61
+ - Reason across multiple documents simultaneously
62
+ - Apply business rules with thresholds and exceptions
63
+ - Detect fraud signals that require cross-referencing
64
+ - Make nuanced decisions (partial approve, hold, escalate)
65
+ - Know *not* to contact a supplier via a potentially compromised channel
66
+
67
+ This gap means agents trained on existing benchmarks cannot be evaluated or trained on one of the most common finance workflows in enterprise software.
68
+
69
+ ### 2.3 What This Environment Fixes
70
+
71
+ The Invoice Exception Handler provides:
72
+ - A clean, typed, deterministic simulation of AP exception handling
73
+ - Three tasks that test a progression of reasoning: threshold logic → duplicate detection → multi-signal fraud
74
+ - Shaped rewards that signal progress at every step, not just at episode end
75
+ - A fully deployable environment that conforms to the OpenEnv spec
76
+
77
+ ---
78
+
79
+ ## 3. Product Vision
80
+
81
+ > An agent that scores well in this environment is demonstrably better at AP exception handling than the average accounts payable clerk — and is ready to be deployed in real enterprise finance workflows.
82
+
83
+ The environment is designed so that:
84
+ - The reward signal is meaningful enough to actually train agents on, not just evaluate them
85
+ - The hard task (compound fraud) remains genuinely difficult for frontier models
86
+ - Every score between 0.0 and 1.0 reflects a real quality difference in agent behavior
87
+
88
+ ---
89
+
90
+ ## 4. Stakeholders
91
+
92
+ | Stakeholder | Role | Interest |
93
+ |---|---|---|
94
+ | Hackathon Judges (Meta, HF engineers) | Evaluators | Real-world utility, code quality, creativity |
95
+ | OpenEnv Automated Validator | Gatekeeper | Spec compliance, deployment health |
96
+ | AI Researchers | Primary users post-submission | Training and evaluating AP agents |
97
+ | Enterprise Software Companies | Secondary users | Evaluating models for AP automation products |
98
+
99
+ ---
100
+
101
+ ## 5. Functional Requirements
102
+
103
+ ### 5.1 Core Environment API
104
+
105
+ | Requirement | Priority | Detail |
106
+ |---|---|---|
107
+ | FR-001 | MUST | `env.reset(task_id)` returns a clean `EnvironmentState` |
108
+ | FR-002 | MUST | `env.step(action)` returns `StepResult(observation, reward, done, info)` |
109
+ | FR-003 | MUST | `env.state()` returns current state without advancing episode |
110
+ | FR-004 | MUST | `env.grade()` returns a score dict with overall score 0.0–1.0 |
111
+ | FR-005 | MUST | All models are typed Pydantic v2 with no untyped fields |
112
+ | FR-006 | MUST | `openenv.yaml` passes `openenv validate` |
113
+
114
+ ### 5.2 HTTP Endpoints (for HF Spaces validator)
115
+
116
+ | Requirement | Priority | Detail |
117
+ |---|---|---|
118
+ | FR-007 | MUST | `POST /reset` returns HTTP 200 with JSON observation |
119
+ | FR-008 | MUST | `POST /step` returns HTTP 200 with JSON StepResult |
120
+ | FR-009 | MUST | `GET /state` returns HTTP 200 with JSON EnvironmentState |
121
+ | FR-010 | MUST | `GET /health` returns HTTP 200 `{"status": "ok"}` |
122
+ | FR-011 | SHOULD | `GET /` returns HTML documentation page |
123
+
124
+ ### 5.3 Task Requirements
125
+
126
+ | Requirement | Priority | Detail |
127
+ |---|---|---|
128
+ | FR-012 | MUST | Minimum 3 tasks with distinct scenarios |
129
+ | FR-013 | MUST | Tasks range easy → medium → hard |
130
+ | FR-014 | MUST | Each task has a deterministic grader returning 0.0–1.0 |
131
+ | FR-015 | MUST | Graders have sub-scores (diagnosis, investigation, decision, routing, closure, efficiency) |
132
+ | FR-016 | MUST | Hard task must not be solvable by simple heuristics |
133
+
134
+ ### 5.4 Reward Function
135
+
136
+ | Requirement | Priority | Detail |
137
+ |---|---|---|
138
+ | FR-017 | MUST | Reward is shaped across the full trajectory |
139
+ | FR-018 | MUST | Dangerous actions (approving fraud) produce large negative rewards |
140
+ | FR-019 | MUST | Repeating already-completed actions penalised lightly |
141
+ | FR-020 | MUST | Exceeding step budget penalised (SLA concept) |
142
+ | FR-021 | SHOULD | Efficiency bonus for completing faster than optimal |
143
+
144
+ ### 5.5 Inference Script
145
+
146
+ | Requirement | Priority | Detail |
147
+ |---|---|---|
148
+ | FR-022 | MUST | Script named exactly `inference.py` in root directory |
149
+ | FR-023 | MUST | Uses OpenAI client (not Anthropic SDK) |
150
+ | FR-024 | MUST | Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from environment |
151
+ | FR-025 | MUST | Emits `[START]`, `[STEP]`, `[END]` lines to stdout exactly as spec |
152
+ | FR-026 | MUST | Completes all 3 tasks in under 20 minutes on 2 vCPU / 8 GB RAM |
153
+ | FR-027 | MUST | Produces reproducible scores with the same seed |
154
+
155
+ ### 5.6 Deployment
156
+
157
+ | Requirement | Priority | Detail |
158
+ |---|---|---|
159
+ | FR-028 | MUST | Dockerfile builds cleanly without internet access at run time |
160
+ | FR-029 | MUST | Container starts and serves on port 7860 |
161
+ | FR-030 | MUST | HF Spaces `POST /reset` returns 200 |
162
+ | FR-031 | MUST | README documents setup, action space, observation space, tasks, baseline scores |
163
+
164
+ ---
165
+
166
+ ## 6. Non-Functional Requirements
167
+
168
+ | ID | Category | Requirement |
169
+ |---|---|---|
170
+ | NFR-001 | Performance | `reset()` completes in < 100ms |
171
+ | NFR-002 | Performance | `step()` completes in < 50ms |
172
+ | NFR-003 | Performance | Full 3-task inference run completes in < 20 minutes |
173
+ | NFR-004 | Resource | Runs on 2 vCPU, 8 GB RAM — no GPU required |
174
+ | NFR-005 | Correctness | Grader output is deterministic — same actions = same score |
175
+ | NFR-006 | Correctness | Reward values are deterministic — no randomness in simulation |
176
+ | NFR-007 | Code quality | No bare `except:` blocks — all exceptions typed |
177
+ | NFR-008 | Code quality | All functions have docstrings |
178
+ | NFR-009 | Code quality | Type hints on all function signatures |
179
+ | NFR-010 | Portability | Zero OS-specific code — runs on Linux (Docker) |
180
+ | NFR-011 | Security | No hardcoded credentials anywhere in code |
181
+
182
+ ---
183
+
184
+ ## 7. System Architecture
185
+
186
+ ```
187
+ ┌─────────────────────────────────────────────────────────────┐
188
+ │ HF Space / Docker Container │
189
+ │ │
190
+ │ ┌──────────────┐ ┌──────────────────────────────────┐ │
191
+ │ │ Gradio UI │ │ FastAPI Server │ │
192
+ │ │ (port 7860) │ │ POST /reset GET /state │ │
193
+ │ │ │ │ POST /step GET /health │ │
194
+ │ └──────┬───────┘ └──────────────┬───────────────────┘ │
195
+ │ │ │ │
196
+ │ └──────────┬────────────────┘ │
197
+ │ │ │
198
+ │ ┌──────────▼──────────────┐ │
199
+ │ │ InvoiceExceptionEnv │ │
200
+ │ │ reset() step() state() │ │
201
+ │ │ grade() │ │
202
+ │ └──────────┬──────────────┘ │
203
+ │ │ │
204
+ │ ┌──────────▼──────────────┐ │
205
+ │ │ Task Registry │ │
206
+ │ │ task1_price_variance │ │
207
+ │ │ task2_duplicate_tax │ │
208
+ │ │ task3_compound_fraud │ │
209
+ │ └─────────────────────────┘ │
210
+ └─────────────────────────────────────────────────────────────┘
211
+
212
+ ┌─────────────────────────────────────────────────────────────┐
213
+ │ inference.py (agent) │
214
+ │ │
215
+ │ OpenAI Client → env.reset() → loop { │
216
+ │ action = LLM(observation_json) │
217
+ │ result = env.step(action) │
218
+ │ log [STEP] │
219
+ │ } → log [END] │
220
+ └─────────────────────────────────────────────────────────────┘
221
+ ```
222
+
223
+ ### 7.1 Data Flow
224
+
225
+ ```
226
+ Episode start
227
+
228
+
229
+ reset(task_id) ──► builds DocumentPacket + EpisodeData ──► EnvironmentState
230
+
231
+
232
+ step(action) ──► dispatch to task simulator ──► (reward, info)
233
+ │ │
234
+ ▼ ▼
235
+ EpisodeData updated ◄──────────────────── append to history
236
+
237
+
238
+ new EnvironmentState built ──► StepResult(obs, reward, done, info)
239
+
240
+
241
+ grade() ──► EpisodeData ──► grader logic ──► Dict[str, float]
242
+ ```
243
+
244
+ ---
245
+
246
+ ## 8. Task Specifications
247
+
248
+ ### 8.1 Task 1 — Price Variance Exception (Easy)
249
+
250
+ **Scenario:** Office stationery invoice arrives 3.08% above the PO amount. Company tolerance policy is ±2% for auto-approval. The supplier cites an email from the procurement team recording a verbal approval of a raw material price increase — an approval that was never formalised in the PO.
251
+
252
+ **What makes it easy:** Single root cause, all signals are benign (no fraud), the fix is straightforward (confirm with procurement, approve with PO amendment).
253
+
254
+ **Optimal path (10 steps):**
255
+ ```
256
+ run_check(po_match)
257
+ run_check(tolerance_rule) ← finds 3.08% > 2%
258
+ cross_check(unit_price, invoice, po) ← finds two mismatched lines
259
+ run_check(grn_match) ← confirms delivery complete
260
+ query_supplier(reason for increase) ← gets email confirmation
261
+ query_internal(procurement, confirm?) ← procurement confirms verbal approval
262
+ apply_rule(tolerance_exception_approval)
263
+ make_decision(approve, reason)
264
+ route_to(procurement, raise PO amendment)
265
+ close_case(summary)
266
+ ```
267
+
268
+ **Pitfalls:**
269
+ - Rejecting without querying supplier → wrong decision, score capped at ~0.35
270
+ - Approving without checking tolerance rule → policy violation, −0.15
271
+ - Disabling fraud checks that aren't needed → wasted steps
272
+
273
+ **Grader weights:**
274
+ | Sub-score | Max | Key signals |
275
+ |---|---|---|
276
+ | Diagnosis | 0.32 | tolerance_rule check, price mismatch found |
277
+ | Investigation | 0.30 | supplier queried, procurement confirmed |
278
+ | Decision | 0.18 | correct approve decision |
279
+ | Routing | 0.12 | PO amendment sent to procurement |
280
+ | Closure | 0.08 | case closed with summary |
281
+
282
+ ---
283
+
284
+ ### 8.2 Task 2 — Duplicate Invoice with Hidden Tax Error (Medium)
285
+
286
+ **Scenario:** Logistics supplier submits INV-2024-891. System flags it as a possible duplicate of INV-2024-819 (already paid). The invoice numbers differ by a digit transposition (8-9-1 vs 8-1-9). However: the original invoice applied 15% GST (wrong rate); the correct rate is 18%. The company underpaid ₹3,240 in tax on the original invoice — the supplier under-billed the GST. The new invoice has the correct rate. So it is simultaneously a duplicate AND a legitimate correction.
288
+
289
+ **What makes it medium:** The agent must not just detect the duplicate and reject — it must also detect the tax error in the *original* paid invoice and partially approve the correction delta (₹3,240). A simple "reject all duplicates" rule misses this and loses significant score.
290
+
291
+ **Optimal path (11 steps):**
292
+ ```
293
+ run_check(duplicate_detection) ← finds INV-2024-819
294
+ inspect_field(invoice, invoice_number) ← spots digit transposition
295
+ run_check(tax_calculation_verify) ← finds 15% vs 18% on original
296
+ cross_check(tax_amount, invoice, payment_history) ← confirms ₹3,240 delta
297
+ query_internal(finance, confirm underpayment?)
297
+ query_supplier(clarify relationship between invoices)
298
+ apply_rule(partial_approval)
299
+ apply_rule(credit_note_request)
300
+ make_decision(partial_approve, reason)
301
+ route_to(finance, process ₹3,240 correction)
302
+ close_case(summary)
303
+ ```
304
+
305
+ **Pitfalls:**
306
+ - Full rejection (catches duplicate, misses correction): score ~0.35
307
+ - Full approval (pays full duplicate): score −0.15
308
+ - Partial approve without credit note: score ~0.60
309
+
310
+ **Grader weights:**
311
+ | Sub-score | Max | Key signals |
312
+ |---|---|---|
313
+ | Diagnosis | 0.30 | duplicate found, tax error found |
314
+ | Investigation | 0.32 | finance queried, supplier queried, rules applied |
315
+ | Decision | 0.20 | partial_approve correct |
316
+ | Routing | 0.08 | finance routed for correction |
317
+ | Closure | 0.06 | case closed |
318
+
319
+ ---
320
+
321
+ ### 8.3 Task 3 — Compound Fraud Signals (Hard)
322
+
323
+ **Scenario:** IT equipment supplier submits an ₹8,47,500 invoice for 15 laptops. The system flags a bank account change. But there are four simultaneous fraud signals:
324
+
325
+ 1. **Bank account fraud:** Invoice has a different bank account than supplier master. The change request came from a lookalike email domain (techcore-solutions.com vs registered techcore-solutions.in) — a classic Business Email Compromise (BEC) attack.
326
+ 2. **GSTIN fraud:** The GST number on the invoice belongs to a *different company* — "TechCore Trading Pvt Ltd" in Delhi, not "TechCore Solutions."
327
+ 3. **Quantity mismatch:** Invoice claims 15 laptops. GRN confirms only 13 received. 2 are still in transit.
328
+ 4. **Price inflation:** ₹56,500/unit invoiced vs ₹52,000/unit on PO — 8.65% above PO with no approved revision.
329
+
330
+ **Critical trap:** If the agent queries the supplier via email, they contact the fraudster who will try to convince them to approve. The agent must use phone/registered channel.
331
+
332
+ **What makes it hard:** The agent must hold four independent fraud signals simultaneously, not be satisfied by finding one and stopping, detect the correct communication channel policy, and route to both legal and security (not just finance).
333
+
334
+ **Optimal path (17 steps):**
335
+ ```
336
+ inspect_field(invoice, bank_account) ← sees mismatch
337
+ cross_check(bank_account, invoice, supplier_master)
338
+ run_check(bank_account_verification) ← finds lookalike domain
339
+ run_check(email_domain_verification)
340
+ inspect_field(invoice, supplier_gstin)
341
+ run_check(gst_verification) ← finds GST belongs to different entity
342
+ cross_check(gstin, invoice, supplier_master)
343
+ inspect_field(grn, items_received)
344
+ run_check(grn_match) ← 13 vs 15
345
+ run_check(price_check) ← 8.65% above PO
346
+ query_supplier(confirm details, channel=phone) ← supplier confirms fraud
347
+ query_internal(security, investigate BEC)
348
+ apply_rule(fraud_hold)
349
+ make_decision(reject, all fraud signals documented)
350
+ route_to(legal, initiate supplier audit)
351
+ route_to(security, BEC investigation)
352
+ close_case(fraud report summary)
353
+ ```
354
+
355
+ **Critical pitfall — contacting via email:** −0.15 reward, and the agent receives the fraudster's response attempting to get the payment approved. Scoring penalises this heavily.
356
+
357
+ **Grader weights:**
358
+ | Sub-score | Max | Key signals |
359
+ |---|---|---|
360
+ | Diagnosis | 0.50 | bank fraud, GST fraud, quantity mismatch, domain lookalike, price inflation |
361
+ | Investigation | 0.20 | phone contact (not email), security queried, legal queried |
362
+ | Decision | 0.20 | reject with all signals documented |
363
+ | Routing | 0.20 | legal + security routed |
364
+ | Closure | 0.06 | case closed with fraud report |
+
+ > Note: these sub-score maxima sum to 1.16; the grader clips the combined score to the 0.0–1.0 range required by FR-014.
365
+
366
+ **Scoring thresholds:**
367
+ - Find 1 signal: ~0.20
368
+ - Find 2 signals: ~0.40
369
+ - Find 3 signals: ~0.60
370
+ - Find all 4 + correct routing: ~0.90+
371
+
372
+ ---
373
+
374
+ ## 9. Reward Design
375
+
376
+ ### 9.1 Philosophy
377
+
378
+ The reward function is designed around three principles:
379
+
380
+ **Principle 1: Every informative action gets signal.** Agents should learn that investigating is always better than guessing. Each relevant inspection, check, or query returns a positive reward proportional to how diagnostic that action is.
381
+
382
+ **Principle 2: Dangerous actions get crushed.** Approving a fraudulent invoice, disabling security controls, or contacting a supplier via a compromised channel are not mistakes — they are catastrophic errors. These must receive large negative rewards so agents learn to avoid them unconditionally.
383
+
384
+ **Principle 3: The grader is the ground truth, the shaped reward is the training signal.** The episode reward is shaped to help agents learn. The grader score at the end is what actually measures quality.
385
+
386
+ ### 9.2 Reward Table
387
+
388
+ | Action | Reward Range | Notes |
389
+ |---|---|---|
390
+ | `inspect_field` (relevant) | +0.01 to +0.14 | Higher for fields that reveal anomalies |
391
+ | `inspect_field` (irrelevant) | +0.01 | Still small positive — exploration is fine |
392
+ | `cross_check` (finds mismatch) | +0.12 to +0.15 | Diagnosis reward |
393
+ | `cross_check` (no mismatch) | +0.02 | Confirms a clean field |
394
+ | `run_check` (finds issue) | +0.08 to +0.18 | Higher for more diagnostic checks |
395
+ | `run_check` (clean) | +0.01 to +0.06 | Clean checks still confirm facts |
396
+ | `query_supplier` (phone) | +0.10 to +0.15 | Correct channel |
397
+ | `query_supplier` (email, fraud task) | −0.15 | Contacts fraudster |
398
+ | `query_internal` (key dept) | +0.04 to +0.12 | Higher for departments that add critical info |
399
+ | `apply_rule` (correct rule) | +0.08 to +0.12 | Applying the right policy pathway |
400
+ | `apply_rule` (wrong rule) | −0.05 to −0.10 | Misapplying policy |
401
+ | `make_decision` (correct) | +0.18 to +0.28 | Correct decision based on evidence |
402
+ | `make_decision` (wrong) | −0.10 to −0.40 | Severity scales with how wrong |
403
+ | `route_to` (correct team) | +0.06 to +0.14 | Right escalation path |
404
+ | `close_case` (complete) | +0.06 to +0.12 | Depends on decision quality |
405
+ | Repeat action | −0.02 to −0.05 | Light penalty, not catastrophic |
406
+ | SLA breach (exceed max steps) | −0.10 | One-time penalty at end |
407
+
408
+ ### 9.3 Episode Score vs Cumulative Reward
409
+
410
+ These are different numbers:
411
+ - **Cumulative reward** is the sum of step rewards. It is used as a training signal.
412
+ - **Episode score** (from `grade()`) is the holistic quality assessment. It is what the hackathon evaluates.
413
+
414
+ Agents should be optimised on the grade score, not the cumulative reward alone.
415
+
416
+ ---
417
+
418
+ ## 10. Evaluation Criteria
419
+
420
+ ### 10.1 Hackathon Scoring
421
+
422
+ | Criterion | Weight | What judges look for |
423
+ |---|---|---|
424
+ | Real-world utility | 30% | Would an enterprise actually use this? Does it model the task faithfully? |
425
+ | Task & grader quality | 25% | Clear objectives, accurate grading, genuine difficulty progression, frontier models challenged |
426
+ | Environment design | 20% | Clean state management, good action/observation spaces, shaped reward, sensible episode boundaries |
427
+ | Code quality & spec compliance | 15% | OpenEnv spec passes, Dockerfile works, baseline reproduces, typed models |
428
+ | Creativity & novelty | 10% | Novel domain, interesting mechanics, original reward design |
429
+
430
+ ### 10.2 Automated Gates (must all pass)
431
+
432
+ 1. HF Space deploys — `POST /reset` returns 200
433
+ 2. `openenv validate` passes
434
+ 3. `docker build` succeeds
435
+ 4. `python inference.py` runs without error, produces scores
436
+ 5. All 3 tasks enumerated, grader scores verified in [0.0, 1.0]
437
+
438
+ ### 10.3 Phase 2 — Agentic Evaluation
439
+
440
+ The hackathon will run a standard open LLM agent (e.g. Nemotron 3 Super) against the environment. The environment must:
441
+ - Not be trivially solvable by a greedy agent
442
+ - Produce score variance across tasks (not all the same)
443
+ - Penalise clearly suboptimal behaviour
444
+
445
+ ### 10.4 Disqualifiers
446
+
447
+ - Environment does not deploy or respond to `/reset`
448
+ - Graders that always return the same score regardless of actions
449
+ - `inference.py` not in root, or not using OpenAI client
450
+ - No baseline scores produced
451
+ - Plagiarised environment
452
+
453
+ ---
454
+
455
+ ## 11. API Contract
456
+
457
+ ### 11.1 Environment Python API
458
+
459
+ ```python
460
+ env = InvoiceExceptionEnv(seed=42)
461
+
462
+ # Reset — returns EnvironmentState
463
+ obs: EnvironmentState = env.reset("task1_price_variance")
464
+
465
+ # Step — returns StepResult
466
+ result: StepResult = env.step(Action.run_check("tolerance_rule"))
467
+ # result.observation → EnvironmentState
468
+ # result.reward → float
469
+ # result.done → bool
470
+ # result.info → dict
471
+
472
+ # State — non-destructive peek
473
+ obs: EnvironmentState = env.state()
474
+
475
+ # Grade — run grader on episode
476
+ scores: dict = env.grade()
477
+ # scores["score"] → 0.0–1.0 overall
478
+ # scores["diagnosis_score"] → float
479
+ # scores["decision_score"] → float
480
+ # ...
481
+ ```
482
+
483
+ ### 11.2 HTTP API
484
+
485
+ ```
486
+ POST /reset
487
+ Body: {"task_id": "task1_price_variance"} (optional — random if omitted)
488
+ Response: 200 EnvironmentState JSON
489
+
490
+ POST /step
491
+ Body: {"type": "run_check", "params": {"check_name": "tolerance_rule"}}
492
+ Response: 200 StepResult JSON
493
+
494
+ GET /state
495
+ Response: 200 EnvironmentState JSON
496
+
497
+ POST /grade
498
+ Response: 200 {"score": 0.85, "diagnosis_score": ...}
499
+
500
+ GET /tasks
501
+ Response: 200 ["task1_price_variance", "task2_duplicate_tax", "task3_compound_fraud"]
502
+
503
+ GET /health
504
+ Response: 200 {"status": "ok", "version": "1.0.0"}
505
+ ```
506
+
507
+ ### 11.3 Action Schema
508
+
509
+ ```json
510
+ {
511
+ "type": "run_check",
512
+ "params": {"check_name": "tolerance_rule"}
513
+ }
514
+
515
+ {
516
+ "type": "inspect_field",
517
+ "params": {"document": "invoice", "field": "bank_account"}
518
+ }
519
+
520
+ {
521
+ "type": "cross_check",
522
+ "params": {"field": "unit_price", "doc_a": "invoice", "doc_b": "po"}
523
+ }
524
+
525
+ {
526
+ "type": "query_supplier",
527
+ "params": {"question": "Why does your bank account differ?", "channel": "phone"}
528
+ }
529
+
530
+ {
531
+ "type": "query_internal",
532
+ "params": {"department": "procurement", "question": "Did you approve this price?"}
533
+ }
534
+
535
+ {
536
+ "type": "apply_rule",
537
+ "params": {"rule_id": "tolerance_exception_approval"}
538
+ }
539
+
540
+ {
541
+ "type": "make_decision",
542
+ "params": {"decision": "approve", "reason": "Verbal approval confirmed by procurement."}
543
+ }
544
+
545
+ {
546
+ "type": "route_to",
547
+ "params": {"team": "procurement", "notes": "Please raise PO amendment for the price variance."}
548
+ }
549
+
550
+ {
551
+ "type": "close_case",
552
+ "params": {"summary": "Invoice approved. PO amendment requested. Case closed."}
553
+ }
554
+ ```
555
+
556
+ ---
557
+
558
+ ## 12. File Structure
559
+
560
+ ```
561
+ invoice-exception-handler/
562
+
563
+ ├── README.md # Full setup + usage guide
564
+ ├── openenv.yaml # OpenEnv spec (must pass openenv validate)
565
+ ├── Dockerfile # Single-stage Python 3.11-slim build
566
+ ├── requirements.txt # Pinned dependencies
567
+ ├── inference.py # Competition inference script (MUST be here)
568
+ ├── app.py # Gradio + FastAPI entrypoint for HF Spaces
569
+
570
+ ├── env/
571
+ │ ├── __init__.py
572
+ │ ├── models.py # All Pydantic typed models
573
+ │ ├── environment.py # InvoiceExceptionEnv class
574
+ │ └── tasks.py # 3 task classes + graders + EpisodeData
575
+
576
+ └── documents/
577
+ ├── PRD-001-product-requirements.md # This document
578
+ ├── CHANGELOG.md # Every code change recorded
579
+ ├── ARCHITECTURE.md # System diagram + decisions
580
+ └── BASELINE-SCORES.md # Reproducible benchmark results
581
+ ```
582
+
583
+ ---
584
+
585
+ ## 13. Out of Scope
586
+
587
+ The following are explicitly not part of v1.0:
588
+
589
+ - Real database connectivity (the environment is fully simulated)
590
+ - Multi-agent scenarios (one agent per episode)
591
+ - Partial observability (agent sees all documents from the start)
592
+ - User interface for human play (nice-to-have but not required for submission)
593
+ - Real supplier APIs (simulation only)
594
+ - Currency other than INR (can be extended in v1.1)
595
+ - Tasks beyond 3 (can be extended)
596
+
597
+ ---
598
+
599
+ ## 14. Change Log
600
+
601
+ | Version | Date | Author | Change |
602
+ |---|---|---|---|
603
+ | 0.1.0 | 2025-01-18 | [Author] | Initial draft — problem definition and task sketches |
604
+ | 0.2.0 | 2025-01-19 | [Author] | Added reward design section, API contract, file structure |
605
+ | 1.0.0 | 2025-01-20 | [Author] | Final version — all sections complete, ready for implementation |
documents/PRD.md ADDED
@@ -0,0 +1,605 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Product Requirements Document
2
+ ## Invoice Exception Handler — OpenEnv Agent Learning Environment
3
+
4
+ **Document ID:** PRD-001
5
+ **Version:** 1.0.0
6
+ **Status:** Final
7
+ **Author:** [Your Name]
8
+ **Last Updated:** 2025-01-20
9
+ **Classification:** Internal / Hackathon Submission
10
+
11
+ ---
12
+
13
+ ## Table of Contents
14
+
15
+ 1. [Executive Summary](#1-executive-summary)
16
+ 2. [Problem Statement](#2-problem-statement)
17
+ 3. [Product Vision](#3-product-vision)
18
+ 4. [Stakeholders](#4-stakeholders)
19
+ 5. [Functional Requirements](#5-functional-requirements)
20
+ 6. [Non-Functional Requirements](#6-non-functional-requirements)
21
+ 7. [System Architecture](#7-system-architecture)
22
+ 8. [Task Specifications](#8-task-specifications)
23
+ 9. [Reward Design](#9-reward-design)
24
+ 10. [Evaluation Criteria](#10-evaluation-criteria)
25
+ 11. [API Contract](#11-api-contract)
26
+ 12. [File Structure](#12-file-structure)
27
+ 13. [Out of Scope](#13-out-of-scope)
28
+ 14. [Change Log](#14-change-log)
29
+
30
+ ---
31
+
32
+ ## 1. Executive Summary
33
+
34
+ The Invoice Exception Handler is a real-world agent learning environment built for the OpenEnv standard. It simulates the accounts payable (AP) exception handling workflow that every business on earth runs daily — the process of investigating flagged invoices before payment is approved.
35
+
36
+ The environment places an AI agent in the role of an AP analyst. The agent receives a document packet (Purchase Order, Invoice, Goods Receipt Note, Supplier Master), reads an exception flag, and must investigate the root cause, make a decision, route the case to the right team, and close it cleanly. Every action has realistic financial and compliance consequences.
37
+
38
+ The environment ships with three tasks of increasing difficulty — price variance (easy), duplicate with hidden tax error (medium), and compound fraud with four simultaneous signals (hard).
39
+
40
+ ---
41
+
42
+ ## 2. Problem Statement
43
+
44
+ ### 2.1 The Real-World Pain
45
+
46
+ Every company that buys goods or services from suppliers receives invoices. Typically 5–15% of all invoices have exceptions — discrepancies between what was ordered (PO), what was received (GRN), and what was invoiced. These exceptions are currently handled by accounts payable clerks who manually:
47
+
48
+ 1. Pull the original Purchase Order
49
+ 2. Compare it line by line against the invoice
50
+ 3. Check the Goods Receipt Note
51
+ 4. Run validation checks
52
+ 5. Query internal teams or the supplier
53
+ 6. Make a decision (approve / reject / hold / partial approve)
54
+ 7. Route the case and document everything
55
+
56
+ At a mid-size company this is 2–4 hours of analyst time per day. At enterprise scale it is entire departments. This inefficiency drives an AP automation market that exceeds $3 billion annually.
57
+
58
+ ### 2.2 The AI Gap
59
+
60
+ No existing OpenEnv benchmark tests an agent's ability to:
61
+ - Reason across multiple documents simultaneously
62
+ - Apply business rules with thresholds and exceptions
63
+ - Detect fraud signals that require cross-referencing
64
+ - Make nuanced decisions (partial approve, hold, escalate)
65
+ - Know *not* to contact a supplier via a potentially compromised channel
66
+
67
+ This gap means agents trained on existing benchmarks cannot be evaluated or trained on one of the most common finance workflows in enterprise software.
68
+
69
+ ### 2.3 What This Environment Fixes
70
+
71
+ The Invoice Exception Handler provides:
72
+ - A clean, typed, deterministic simulation of AP exception handling
73
+ - Three tasks that test a progression of reasoning: threshold logic → duplicate detection → multi-signal fraud
74
+ - Shaped rewards that signal progress at every step, not just at episode end
75
+ - A fully deployable environment that conforms to the OpenEnv spec
76
+
77
+ ---
78
+
79
+ ## 3. Product Vision
80
+
81
+ > An agent that scores well in this environment is demonstrably better at AP exception handling than the average accounts payable clerk — and is ready to be deployed in real enterprise finance workflows.
82
+
83
+ The environment is designed so that:
84
+ - The reward signal is meaningful enough to actually train agents on, not just evaluate them
85
+ - The hard task (compound fraud) remains genuinely difficult for frontier models
86
+ - Every score between 0.0 and 1.0 reflects a real quality difference in agent behavior
87
+
88
+ ---
89
+
90
+ ## 4. Stakeholders
91
+
92
+ | Stakeholder | Role | Interest |
93
+ |---|---|---|
94
+ | Hackathon Judges (Meta, HF engineers) | Evaluators | Real-world utility, code quality, creativity |
95
+ | OpenEnv Automated Validator | Gatekeeper | Spec compliance, deployment health |
96
+ | AI Researchers | Primary users post-submission | Training and evaluating AP agents |
97
+ | Enterprise Software Companies | Secondary users | Evaluating models for AP automation products |
98
+
99
+ ---
100
+
101
+ ## 5. Functional Requirements
102
+
103
+ ### 5.1 Core Environment API
104
+
105
+ | Requirement | Priority | Detail |
106
+ |---|---|---|
107
+ | FR-001 | MUST | `env.reset(task_id)` returns a clean `EnvironmentState` |
108
+ | FR-002 | MUST | `env.step(action)` returns `StepResult(observation, reward, done, info)` |
109
+ | FR-003 | MUST | `env.state()` returns current state without advancing episode |
110
+ | FR-004 | MUST | `env.grade()` returns a score dict with overall score 0.0–1.0 |
111
+ | FR-005 | MUST | All models are typed Pydantic v2 with no untyped fields |
112
+ | FR-006 | MUST | `openenv.yaml` passes `openenv validate` |
113
+
114
+ ### 5.2 HTTP Endpoints (for HF Spaces validator)
115
+
116
+ | Requirement | Priority | Detail |
117
+ |---|---|---|
118
+ | FR-007 | MUST | `POST /reset` returns HTTP 200 with JSON observation |
119
+ | FR-008 | MUST | `POST /step` returns HTTP 200 with JSON StepResult |
120
+ | FR-009 | MUST | `GET /state` returns HTTP 200 with JSON EnvironmentState |
121
+ | FR-010 | MUST | `GET /health` returns HTTP 200 `{"status": "ok"}` |
122
+ | FR-011 | SHOULD | `GET /` returns HTML documentation page |
123
+
124
+ ### 5.3 Task Requirements
125
+
126
+ | Requirement | Priority | Detail |
127
+ |---|---|---|
128
+ | FR-012 | MUST | Minimum 3 tasks with distinct scenarios |
129
+ | FR-013 | MUST | Tasks range easy → medium → hard |
130
+ | FR-014 | MUST | Each task has a deterministic grader returning 0.0–1.0 |
131
+ | FR-015 | MUST | Graders have sub-scores (diagnosis, investigation, decision, routing, closure, efficiency) |
132
+ | FR-016 | MUST | Hard task must not be solvable by simple heuristics |
133
+
134
+ ### 5.4 Reward Function
135
+
136
+ | Requirement | Priority | Detail |
137
+ |---|---|---|
138
+ | FR-017 | MUST | Reward is shaped across the full trajectory |
139
+ | FR-018 | MUST | Dangerous actions (approving fraud) produce large negative rewards |
140
+ | FR-019 | MUST | Repeating already-completed actions penalised lightly |
141
+ | FR-020 | MUST | Exceeding step budget penalised (SLA concept) |
142
+ | FR-021 | SHOULD | Efficiency bonus for completing faster than optimal |
143
+
144
+ ### 5.5 Inference Script
145
+
146
+ | Requirement | Priority | Detail |
147
+ |---|---|---|
148
+ | FR-022 | MUST | Script named exactly `inference.py` in root directory |
149
+ | FR-023 | MUST | Uses OpenAI client (not Anthropic SDK) |
150
+ | FR-024 | MUST | Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from environment |
151
+ | FR-025 | MUST | Emits `[START]`, `[STEP]`, `[END]` lines to stdout exactly as spec |
152
+ | FR-026 | MUST | Completes all 3 tasks in under 20 minutes on 2 vCPU / 8 GB RAM |
153
+ | FR-027 | MUST | Produces reproducible scores with the same seed |
154
+
155
+ ### 5.6 Deployment
156
+
157
+ | Requirement | Priority | Detail |
158
+ |---|---|---|
159
+ | FR-028 | MUST | Dockerfile builds cleanly without internet access at run time |
160
+ | FR-029 | MUST | Container starts and serves on port 7860 |
161
+ | FR-030 | MUST | HF Spaces `POST /reset` returns 200 |
162
+ | FR-031 | MUST | README documents setup, action space, observation space, tasks, baseline scores |
163
+
164
+ ---
165
+
166
+ ## 6. Non-Functional Requirements
167
+
168
+ | ID | Category | Requirement |
169
+ |---|---|---|
170
+ | NFR-001 | Performance | `reset()` completes in < 100ms |
171
+ | NFR-002 | Performance | `step()` completes in < 50ms |
172
+ | NFR-003 | Performance | Full 3-task inference run completes in < 20 minutes |
173
+ | NFR-004 | Resource | Runs on 2 vCPU, 8 GB RAM — no GPU required |
174
+ | NFR-005 | Correctness | Grader output is deterministic — same actions = same score |
175
+ | NFR-006 | Correctness | Reward values are deterministic — no randomness in simulation |
176
+ | NFR-007 | Code quality | No bare `except:` blocks — all exceptions typed |
177
+ | NFR-008 | Code quality | All functions have docstrings |
178
+ | NFR-009 | Code quality | Type hints on all function signatures |
179
+ | NFR-010 | Portability | Zero OS-specific code — runs on Linux (Docker) |
180
+ | NFR-011 | Security | No hardcoded credentials anywhere in code |
181
+
182
+ ---
183
+
184
+ ## 7. System Architecture
185
+
186
+ ```
187
+ ┌─────────────────────────────────────────────────────────────┐
188
+ │ HF Space / Docker Container │
189
+ │ │
190
+ │ ┌──────────────┐ ┌──────────────────────────────────┐ │
191
+ │ │ Gradio UI │ │ FastAPI Server │ │
192
+ │ │ (port 7860) │ │ POST /reset GET /state │ │
193
+ │ │ │ │ POST /step GET /health │ │
194
+ │ └──────┬───────┘ └──────────────┬───────────────────┘ │
195
+ │ │ │ │
196
+ │ └──────────┬────────────────┘ │
197
+ │ │ │
198
+ │ ┌──────────▼──────────────┐ │
199
+ │ │ InvoiceExceptionEnv │ │
200
+ │ │ reset() step() state() │ │
201
+ │ │ grade() │ │
202
+ │ └──────────┬──────────────┘ │
203
+ │ │ │
204
+ │ ┌──────────▼──────────────┐ │
205
+ │ │ Task Registry │ │
206
+ │ │ task1_price_variance │ │
207
+ │ │ task2_duplicate_tax │ │
208
+ │ │ task3_compound_fraud │ │
209
+ │ └─────────────────────────┘ │
210
+ └─────────────────────────────────────────────────────────────┘
211
+
212
+ ┌─────────────────────────────────────────────────────────────┐
213
+ │ inference.py (agent) │
214
+ │ │
215
+ │ OpenAI Client → env.reset() → loop { │
216
+ │ action = LLM(observation_json) │
217
+ │ result = env.step(action) │
218
+ │ log [STEP] │
219
+ │ } → log [END] │
220
+ └─────────────────────────────────────────────────────────────┘
221
+ ```
222
+
223
+ ### 7.1 Data Flow
224
+
225
+ ```
226
+ Episode start
227
+
228
+
229
+ reset(task_id) ──► builds DocumentPacket + EpisodeData ──► EnvironmentState
230
+
231
+
232
+ step(action) ──► dispatch to task simulator ──► (reward, info)
233
+ │ │
234
+ ▼ ▼
235
+ EpisodeData updated ◄──────────────────── append to history
236
+
237
+
238
+ new EnvironmentState built ──► StepResult(obs, reward, done, info)
239
+
240
+
241
+ grade() ──► EpisodeData ──► grader logic ──► Dict[str, float]
242
+ ```
243
+
244
+ ---
245
+
246
+ ## 8. Task Specifications
247
+
248
+ ### 8.1 Task 1 — Price Variance Exception (Easy)
249
+
250
+ **Scenario:** Office stationery invoice arrives 3.08% above the PO amount. Company tolerance policy is ±2% for auto-approval. The supplier cites an email from the procurement team recording a verbal approval of a raw material price increase — an approval that was never formalised in the PO.
251
+
252
+ **What makes it easy:** Single root cause, all signals are benign (no fraud), the fix is straightforward (confirm with procurement, approve with PO amendment).
253
+
254
+ **Optimal path (10 steps):**
255
+ ```
256
+ run_check(po_match)
257
+ run_check(tolerance_rule) ← finds 3.08% > 2%
258
+ cross_check(unit_price, invoice, po) ← finds two mismatched lines
259
+ run_check(grn_match) ← confirms delivery complete
260
+ query_supplier(reason for increase) ← gets email confirmation
261
+ query_internal(procurement, confirm?) ← procurement confirms verbal approval
262
+ apply_rule(tolerance_exception_approval)
263
+ make_decision(approve, reason)
264
+ route_to(procurement, raise PO amendment)
265
+ close_case(summary)
266
+ ```
267
+
268
+ **Pitfalls:**
269
+ - Rejecting without querying supplier → wrong decision, score capped at ~0.35
270
+ - Approving without checking tolerance rule → policy violation, −0.15
271
+ - Disabling fraud checks that aren't needed → wasted steps
272
+
273
+ **Grader weights:**
274
+ | Sub-score | Max | Key signals |
275
+ |---|---|---|
276
+ | Diagnosis | 0.32 | tolerance_rule check, price mismatch found |
277
+ | Investigation | 0.30 | supplier queried, procurement confirmed |
278
+ | Decision | 0.18 | correct approve decision |
279
+ | Routing | 0.12 | PO amendment sent to procurement |
280
+ | Closure | 0.08 | case closed with summary |
281
+
282
+ ---
283
+
284
+ ### 8.2 Task 2 — Duplicate Invoice with Hidden Tax Error (Medium)
285
+
286
+ **Scenario:** Logistics supplier submits INV-2024-891. System flags it as a possible duplicate of INV-2024-819 (already paid). The invoice numbers differ by a digit transposition (8-9-1 vs 8-1-9). However: the original invoice applied 15% GST (wrong rate); the correct rate is 18%. The company overpaid ₹3,240 in tax on the original invoice. The new invoice has the correct rate. So it is simultaneously a duplicate AND a legitimate correction.
287
+
288
+ **What makes it medium:** The agent must not just detect the duplicate and reject — it must also detect the tax error in the *original* paid invoice and partially approve the correction delta (₹3,240). A simple "reject all duplicates" rule misses this and loses significant score.
289
+
290
+ **Optimal path (11 steps):**
291
+ ```
292
+ run_check(duplicate_detection) ← finds INV-2024-819
293
+ inspect_field(invoice, invoice_number) ← spots digit transposition
294
+ run_check(tax_calculation_verify) ← finds 15% vs 18% on original
295
+ cross_check(tax_amount, invoice, payment_history) ← confirms ₹3,240 delta
296
+ query_internal(finance, confirm overpayment?)
297
+ query_supplier(clarify relationship between invoices)
298
+ apply_rule(partial_approval)
299
+ apply_rule(credit_note_request)
300
+ make_decision(partial_approve, reason)
301
+ route_to(finance, process ₹3,240 correction)
302
+ close_case(summary)
303
+ ```
304
+
305
+ **Pitfalls:**
306
+ - Full rejection (catches duplicate, misses correction): score ~0.35
307
+ - Full approval (pays full duplicate): score −0.15
308
+ - Partial approve without credit note: score ~0.60
309
+
310
+ **Grader weights:**
311
+ | Sub-score | Max | Key signals |
312
+ |---|---|---|
313
+ | Diagnosis | 0.30 | duplicate found, tax error found |
314
+ | Investigation | 0.32 | finance queried, supplier queried, rules applied |
315
+ | Decision | 0.20 | partial_approve correct |
316
+ | Routing | 0.08 | finance routed for correction |
317
+ | Closure | 0.06 | case closed |
318
+
319
+ ---
320
+
321
+ ### 8.3 Task 3 — Compound Fraud Signals (Hard)
322
+
323
+ **Scenario:** IT equipment supplier submits an ₹8,47,500 invoice for 15 laptops. The system flags a bank account change. But there are four simultaneous fraud signals:
324
+
325
+ 1. **Bank account fraud:** Invoice has a different bank account than supplier master. The change request came from a lookalike email domain (techcore-solutions.com vs registered techcore-solutions.in) — a classic Business Email Compromise (BEC) attack.
326
+ 2. **GSTIN fraud:** The GST number on the invoice belongs to a *different company* — "TechCore Trading Pvt Ltd" in Delhi, not "TechCore Solutions."
327
+ 3. **Quantity mismatch:** Invoice claims 15 laptops. GRN confirms only 13 received. 2 are still in transit.
328
+ 4. **Price inflation:** ₹56,500/unit invoiced vs ₹52,000/unit on PO — 8.65% above PO with no approved revision.
329
+
330
+ **Critical trap:** If the agent queries the supplier via email, they contact the fraudster who will try to convince them to approve. The agent must use phone/registered channel.
331
+
332
+ **What makes it hard:** The agent must hold four independent fraud signals simultaneously, not be satisfied by finding one and stopping, detect the correct communication channel policy, and route to both legal and security (not just finance).
333
+
334
+ **Optimal path (17 steps):**
335
+ ```
336
+ inspect_field(invoice, bank_account) ← sees mismatch
337
+ cross_check(bank_account, invoice, supplier_master)
338
+ run_check(bank_account_verification) ← finds lookalike domain
339
+ run_check(email_domain_verification)
340
+ inspect_field(invoice, supplier_gstin)
341
+ run_check(gst_verification) ← finds GST belongs to different entity
342
+ cross_check(gstin, invoice, supplier_master)
343
+ inspect_field(grn, items_received)
344
+ run_check(grn_match) ← 13 vs 15
345
+ run_check(price_check) ← 8.65% above PO
346
+ query_supplier(confirm details, channel=phone) ← supplier confirms fraud
347
+ query_internal(security, investigate BEC)
348
+ apply_rule(fraud_hold)
349
+ make_decision(reject, all fraud signals documented)
350
+ route_to(legal, initiate supplier audit)
351
+ route_to(security, BEC investigation)
352
+ close_case(fraud report summary)
353
+ ```
354
+
355
+ **Critical pitfall — contacting via email:** −0.15 reward, and agent receives fraudster's response trying to get payment approved. Scoring penalises this heavily.
356
+
357
+ **Grader weights:**
358
+ | Sub-score | Max | Key signals |
359
+ |---|---|---|
360
+ | Diagnosis | 0.50 | bank fraud, GST fraud, quantity mismatch, domain lookalike, price inflation |
361
+ | Investigation | 0.20 | phone contact (not email), security queried, legal queried |
362
+ | Decision | 0.20 | reject with all signals documented |
363
+ | Routing | 0.20 | legal + security routed |
364
+ | Closure | 0.06 | case closed with fraud report |
+
+ *(Note: these sub-score maxima intentionally sum to more than 1.0 — the grader clamps the overall score to the [0.0, 1.0] range required by §10.2.)*
365
+
366
+ **Scoring thresholds:**
367
+ - Find 1 signal: ~0.20
368
+ - Find 2 signals: ~0.40
369
+ - Find 3 signals: ~0.60
370
+ - Find all 4 + correct routing: ~0.90+
371
+
372
+ ---
373
+
374
+ ## 9. Reward Design
375
+
376
+ ### 9.1 Philosophy
377
+
378
+ The reward function is designed around three principles:
379
+
380
+ **Principle 1: Every informative action gets signal.** Agents should learn that investigating is always better than guessing. Each relevant inspection, check, or query returns a positive reward proportional to how diagnostic that action is.
381
+
382
+ **Principle 2: Dangerous actions get crushed.** Approving a fraudulent invoice, disabling security controls, or contacting a supplier via a compromised channel are not mistakes — they are catastrophic errors. These must receive large negative rewards so agents learn to avoid them unconditionally.
383
+
384
+ **Principle 3: The grader is the ground truth, the shaped reward is the training signal.** The episode reward is shaped to help agents learn. The grader score at the end is what actually measures quality.
385
+
386
+ ### 9.2 Reward Table
387
+
388
+ | Action | Reward Range | Notes |
389
+ |---|---|---|
390
+ | `inspect_field` (relevant) | +0.01 to +0.14 | Higher for fields that reveal anomalies |
391
+ | `inspect_field` (irrelevant) | +0.01 | Still small positive — exploration is fine |
392
+ | `cross_check` (finds mismatch) | +0.12 to +0.15 | Diagnosis reward |
393
+ | `cross_check` (no mismatch) | +0.02 | Confirms a clean field |
394
+ | `run_check` (finds issue) | +0.08 to +0.18 | Higher for more diagnostic checks |
395
+ | `run_check` (clean) | +0.01 to +0.06 | Clean checks still confirm facts |
396
+ | `query_supplier` (phone) | +0.10 to +0.15 | Correct channel |
397
+ | `query_supplier` (email, fraud task) | −0.15 | Contacts fraudster |
398
+ | `query_internal` (key dept) | +0.04 to +0.12 | Higher for departments that add critical info |
399
+ | `apply_rule` (correct rule) | +0.08 to +0.12 | Applying the right policy pathway |
400
+ | `apply_rule` (wrong rule) | −0.05 to −0.10 | Misapplying policy |
401
+ | `make_decision` (correct) | +0.18 to +0.28 | Correct decision based on evidence |
402
+ | `make_decision` (wrong) | −0.10 to −0.40 | Severity scales with how wrong |
403
+ | `route_to` (correct team) | +0.06 to +0.14 | Right escalation path |
404
+ | `close_case` (complete) | +0.06 to +0.12 | Depends on decision quality |
405
+ | Repeat action | −0.02 to −0.05 | Light penalty, not catastrophic |
406
+ | SLA breach (exceed max steps) | −0.10 | One-time penalty at end |
407
+
408
+ ### 9.3 Episode Score vs Cumulative Reward
409
+
410
+ These are different numbers:
411
+ - **Cumulative reward** is the sum of step rewards. It is used as a training signal.
412
+ - **Episode score** (from `grade()`) is the holistic quality assessment. It is what the hackathon evaluates.
413
+
414
+ Agents should be optimised on the grade score, not the cumulative reward alone.
415
+
416
+ ---
417
+
418
+ ## 10. Evaluation Criteria
419
+
420
+ ### 10.1 Hackathon Scoring
421
+
422
+ | Criterion | Weight | What judges look for |
423
+ |---|---|---|
424
+ | Real-world utility | 30% | Would an enterprise actually use this? Does it model the task faithfully? |
425
+ | Task & grader quality | 25% | Clear objectives, accurate grading, genuine difficulty progression, frontier models challenged |
426
+ | Environment design | 20% | Clean state management, good action/observation spaces, shaped reward, sensible episode boundaries |
427
+ | Code quality & spec compliance | 15% | OpenEnv spec passes, Dockerfile works, baseline reproduces, typed models |
428
+ | Creativity & novelty | 10% | Novel domain, interesting mechanics, original reward design |
429
+
430
+ ### 10.2 Automated Gates (must all pass)
431
+
432
+ 1. HF Space deploys — `POST /reset` returns 200
433
+ 2. `openenv validate` passes
434
+ 3. `docker build` succeeds
435
+ 4. `python inference.py` runs without error, produces scores
436
+ 5. All 3 tasks enumerated, grader scores verified in [0.0, 1.0]
437
+
438
+ ### 10.3 Phase 2 — Agentic Evaluation
439
+
440
+ The hackathon will run a standard open LLM agent (e.g. Nemotron 3 Super) against the environment. The environment must:
441
+ - Not be trivially solvable by a greedy agent
442
+ - Produce score variance across tasks (not all the same)
443
+ - Penalise clearly suboptimal behaviour
444
+
445
+ ### 10.4 Disqualifiers
446
+
447
+ - Environment does not deploy or respond to `/reset`
448
+ - Graders that always return the same score regardless of actions
449
+ - `inference.py` not in root, or not using OpenAI client
450
+ - No baseline scores produced
451
+ - Plagiarised environment
452
+
453
+ ---
454
+
455
+ ## 11. API Contract
456
+
457
+ ### 11.1 Environment Python API
458
+
459
+ ```python
460
+ env = InvoiceExceptionEnv(seed=42)
461
+
462
+ # Reset — returns EnvironmentState
463
+ obs: EnvironmentState = env.reset("task1_price_variance")
464
+
465
+ # Step — returns StepResult
466
+ result: StepResult = env.step(Action.run_check("tolerance_rule"))
467
+ # result.observation → EnvironmentState
468
+ # result.reward → float
469
+ # result.done → bool
470
+ # result.info → dict
471
+
472
+ # State — non-destructive peek
473
+ obs: EnvironmentState = env.state()
474
+
475
+ # Grade — run grader on episode
476
+ scores: dict = env.grade()
477
+ # scores["score"] → 0.0–1.0 overall
478
+ # scores["diagnosis_score"] → float
479
+ # scores["decision_score"] → float
480
+ # ...
481
+ ```
482
+
483
+ ### 11.2 HTTP API
484
+
485
+ ```
486
+ POST /reset
487
+ Body: {"task_id": "task1_price_variance"} (optional — random if omitted)
488
+ Response: 200 EnvironmentState JSON
489
+
490
+ POST /step
491
+ Body: {"type": "run_check", "params": {"check_name": "tolerance_rule"}}
492
+ Response: 200 StepResult JSON
493
+
494
+ GET /state
495
+ Response: 200 EnvironmentState JSON
496
+
497
+ POST /grade
498
+ Response: 200 {"score": 0.85, "diagnosis_score": ...}
499
+
500
+ GET /tasks
501
+ Response: 200 ["task1_price_variance", "task2_duplicate_tax", "task3_compound_fraud"]
502
+
503
+ GET /health
504
+ Response: 200 {"status": "ok", "version": "1.0.0"}
505
+ ```
506
+
507
+ ### 11.3 Action Schema
508
+
509
+ ```json
510
+ {
511
+ "type": "run_check",
512
+ "params": {"check_name": "tolerance_rule"}
513
+ }
514
+
515
+ {
516
+ "type": "inspect_field",
517
+ "params": {"document": "invoice", "field": "bank_account"}
518
+ }
519
+
520
+ {
521
+ "type": "cross_check",
522
+ "params": {"field": "unit_price", "doc_a": "invoice", "doc_b": "po"}
523
+ }
524
+
525
+ {
526
+ "type": "query_supplier",
527
+ "params": {"question": "Why does your bank account differ?", "channel": "phone"}
528
+ }
529
+
530
+ {
531
+ "type": "query_internal",
532
+ "params": {"department": "procurement", "question": "Did you approve this price?"}
533
+ }
534
+
535
+ {
536
+ "type": "apply_rule",
537
+ "params": {"rule_id": "tolerance_exception_approval"}
538
+ }
539
+
540
+ {
541
+ "type": "make_decision",
542
+ "params": {"decision": "approve", "reason": "Verbal approval confirmed by procurement."}
543
+ }
544
+
545
+ {
546
+ "type": "route_to",
547
+ "params": {"team": "procurement", "notes": "Please raise PO amendment for the price variance."}
548
+ }
549
+
550
+ {
551
+ "type": "close_case",
552
+ "params": {"summary": "Invoice approved. PO amendment requested. Case closed."}
553
+ }
554
+ ```
555
+
556
+ ---
557
+
558
+ ## 12. File Structure
559
+
560
+ ```
561
+ invoice-exception-handler/
562
+
563
+ ├── README.md # Full setup + usage guide
564
+ ├── openenv.yaml # OpenEnv spec (must pass openenv validate)
565
+ ├── Dockerfile # Single-stage Python 3.11-slim build
566
+ ├── requirements.txt # Pinned dependencies
567
+ ├── inference.py # Competition inference script (MUST be here)
568
+ ├── app.py # Gradio + FastAPI entrypoint for HF Spaces
569
+
570
+ ├── env/
571
+ │ ├── __init__.py
572
+ │ ├── models.py # All Pydantic typed models
573
+ │ ├── environment.py # InvoiceExceptionEnv class
574
+ │ └── tasks.py # 3 task classes + graders + EpisodeData
575
+
576
+ └── documents/
577
+ ├── PRD-001-product-requirements.md # This document
578
+ ├── CHANGELOG.md # Every code change recorded
579
+ ├── ARCHITECTURE.md # System diagram + decisions
580
+ └── BASELINE-SCORES.md # Reproducible benchmark results
581
+ ```
582
+
583
+ ---
584
+
585
+ ## 13. Out of Scope
586
+
587
+ The following are explicitly not part of v1.0:
588
+
589
+ - Real database connectivity (the environment is fully simulated)
590
+ - Multi-agent scenarios (one agent per episode)
591
+ - Partial observability (agent sees all documents from the start)
592
+ - User interface for human play (nice-to-have but not required for submission)
593
+ - Real supplier APIs (simulation only)
594
+ - Currency other than INR (can be extended in v1.1)
595
+ - Tasks beyond 3 (can be extended)
596
+
597
+ ---
598
+
599
+ ## 14. Change Log
600
+
601
+ | Version | Date | Author | Change |
602
+ |---|---|---|---|
603
+ | 0.1.0 | 2025-01-18 | [Author] | Initial draft — problem definition and task sketches |
604
+ | 0.2.0 | 2025-01-19 | [Author] | Added reward design section, API contract, file structure |
605
+ | 1.0.0 | 2025-01-20 | [Author] | Final version — all sections complete, ready for implementation |
documents/README.md ADDED
@@ -0,0 +1,1610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Invoice Exception Handler — OpenEnv
2
+
3
+ > An AI agent learning environment that simulates accounts payable exception handling.
4
+ > The agent acts as an AP analyst: receives flagged invoices, investigates root causes,
5
+ > makes decisions, and closes cases. Built for the OpenEnv hackathon.
6
+
7
+ [![OpenEnv](https://img.shields.io/badge/OpenEnv-1.0.0-blue)](https://github.com/openenv/openenv)
8
+ [![Python](https://img.shields.io/badge/Python-3.11+-green)](https://python.org)
9
+ [![HF Space](https://img.shields.io/badge/HF%20Space-Live-yellow)](https://huggingface.co/spaces)
10
+
11
+ ---
12
+
13
+ ## For Agents Building This Project
14
+
15
+ This README is the single source of truth for building the entire project from scratch.
16
+ Read every section before writing any code. Do not skip sections. Do not guess.
17
+
18
+ ### Ground Rules
19
+
20
+ 1. **Write code like a human wrote it.** Use real variable names, not `x` or `tmp`. Add comments where
21
+ the logic is non-obvious. Leave one blank line between logical blocks inside functions. Use 4-space
22
+ indentation everywhere. Python files get a module docstring at the top explaining what the file does.
23
+
24
+ 2. **Create a new Git repo and push after every major milestone.** A milestone is: models done,
25
+ tasks done, environment done, API done, inference done, app done. Not after every file.
26
+
27
+ 3. **Record every change in `documents/CHANGELOG.md`.** Use the format in the changelog section below.
28
+ Before pushing, append to the changelog what changed and why.
29
+
30
+ 4. **If something in this README conflicts with the competition spec, the competition spec wins.**
31
+ The competition spec is in the document the user shared. Key points: `inference.py` must use the
32
+ OpenAI client. `[START]` `[STEP]` `[END]` format must be exact. `/reset` must return 200.
33
+
34
+ 5. **Test before pushing.** Run `python -c "from env import InvoiceExceptionEnv"` to check imports.
35
+ Run `python inference.py` with a dummy API key to check the log format. Run `docker build .` to
36
+ check the Dockerfile before claiming it works.
37
+
38
+ ---
39
+
40
+ ## What This Environment Does
41
+
42
+ Every company that buys goods or services deals with invoice exceptions — mismatches between
43
+ what was ordered, what arrived, and what was invoiced. Currently these are handled by accounts
44
+ payable analysts who manually compare documents and decide: approve, reject, hold, or escalate.
45
+
46
+ This environment puts an AI agent in that analyst's chair. The agent receives:
47
+ - A **Purchase Order** (what was agreed to)
48
+ - An **Invoice** (what the supplier is claiming)
49
+ - A **Goods Receipt Note** (what actually arrived)
50
+ - A **Supplier Master** (the verified supplier record)
51
+ - An **Exception Flag** (why the system flagged this invoice)
52
+
53
+ The agent investigates, runs checks, queries people, makes a decision, and closes the case.
54
+ Every action has realistic consequences including financial, compliance, and fraud implications.
55
+
56
+ ---
57
+
58
+ ## Repository Structure
59
+
60
+ Build the project with exactly this structure. Do not add extra directories. Do not rename files.
61
+
62
+ ```
63
+ invoice-exception-handler/
64
+
65
+ ├── README.md ← this file
66
+ ├── openenv.yaml ← OpenEnv spec, must pass openenv validate
67
+ ├── Dockerfile ← single-stage Python 3.11-slim
68
+ ├── requirements.txt ← pinned versions
69
+ ├── inference.py ← competition script, MUST be named this
70
+ ├── app.py ← Gradio + FastAPI, entry point for HF Spaces
71
+
72
+ ├── env/
73
+ │ ├── __init__.py ← exports InvoiceExceptionEnv, Action, ALL_TASKS
74
+ │ ├── models.py ← all Pydantic models (Action, EnvironmentState, etc.)
75
+ │ ├── environment.py ← InvoiceExceptionEnv class
76
+ │ └── tasks.py ← 3 task classes, EpisodeData, graders
77
+
78
+ └── documents/
79
+ ├── PRD-001-product-requirements.md
80
+ ├── CHANGELOG.md
81
+ ├── ARCHITECTURE.md
82
+ └── BASELINE-SCORES.md
83
+ ```
84
+
85
+ ---
86
+
87
+ ## Step-by-Step Build Order
88
+
89
+ Follow this order exactly. Do not jump ahead.
90
+
91
+ ```
92
+ Step 1 → Create the repo
93
+ Step 2 → Write requirements.txt
94
+ Step 3 → Write env/models.py
95
+ Step 4 → Write env/tasks.py
96
+ Step 5 → Write env/environment.py
97
+ Step 6 → Write env/__init__.py
98
+ Step 7 → Smoke test the environment (run a quick script)
99
+ Step 8 → Write openenv.yaml
100
+ Step 9 → Write inference.py
101
+ Step 10 → Write app.py
102
+ Step 11 → Write Dockerfile
103
+ Step 12 → Full end-to-end test
104
+ Step 13 → Write documents/
105
+ Step 14 → Push and verify
106
+ ```
107
+
108
+ ---
109
+
110
+ ## Step 1 — Create the Repo
111
+
112
+ ```bash
113
+ # Create the project directory
114
+ mkdir invoice-exception-handler
115
+ cd invoice-exception-handler
116
+
117
+ # Initialise git
118
+ git init
119
+ git checkout -b main
120
+
121
+ # Create the directory structure
122
+ mkdir -p env documents
123
+
124
+ # Create empty placeholder files so git tracks the structure
125
+ touch env/__init__.py
126
+ touch documents/.gitkeep
127
+
128
+ # First commit — skeleton only
129
+ git add .
130
+ git commit -m "init: project skeleton"
131
+
132
+ # Create the repo on GitHub/HF and push
133
+ # Replace with your actual remote
134
+ git remote add origin https://github.com/YOUR_USERNAME/invoice-exception-handler.git
135
+ git push -u origin main
136
+ ```
137
+
138
+ ---
139
+
140
+ ## Step 2 — requirements.txt
141
+
142
+ Pin every version. Do not use `>=` ranges — the validator builds in a clean environment and
143
+ range mismatches cause mysterious failures.
144
+
145
+ ```
146
+ pydantic==2.7.1
147
+ fastapi==0.111.0
148
+ uvicorn==0.29.0
149
+ gradio==4.36.1
150
+ openai==1.35.3
151
+ pyyaml==6.0.1
152
+ httpx==0.27.0
153
+ python-multipart==0.0.9
154
+ ```
155
+
156
+ ---
157
+
158
+ ## Step 3 — env/models.py
159
+
160
+ This file defines every typed object in the system. Write it before any other Python code.
161
+ Nothing is untyped. Every field has a type annotation.
162
+
163
+ ### What goes in models.py
164
+
165
+ **Enumerations:**
166
+ - `ActionType` — the 9 action types an agent can take (string enum)
167
+ - `DecisionType` — approve / reject / hold / partial_approve (string enum)
168
+ - `CaseStatus` — open / in_review / decided / routed / closed (string enum)
169
+
170
+ **Document models** (read-only context given to the agent):
171
+ - `LineItem` — one line on an invoice or PO (description, quantity, unit_price, total, tax_rate)
172
+ - `PurchaseOrder` — what was agreed to be purchased
173
+ - `Invoice` — what the supplier is claiming
174
+ - `GoodsReceiptNote` — what actually arrived at the warehouse
175
+ - `SupplierMaster` — the verified, registered supplier record
176
+ - `ExceptionFlag` — why the system flagged this invoice (flag_code, description, auto_hold)
177
+
178
+ **Action model:**
179
+ - `Action` — has a `type: ActionType` and `params: Dict[str, Any]`
180
+ - Add classmethod constructors for each action type so callers can do `Action.run_check("tolerance_rule")`
181
+
182
+ **Result models:**
183
+ - `InspectionResult` — what came back from inspect_field (document, field, value, note, timestamp)
184
+ - `CheckResult` — what came back from run_check or cross_check (check_name, passed, detail, timestamp)
185
+ - `QueryResult` — what came back from a query (target, question, response, channel, timestamp)
186
+
187
+ **State models:**
188
+ - `EnvironmentState` — the full observable state returned by reset() and step()
189
+ - `StepResult` — what step() returns: (observation, reward, done, info)
190
+
191
+ ### EnvironmentState fields
192
+
193
+ The EnvironmentState must include:
194
+ - `task_id: str`
195
+ - `step_number: int`
196
+ - `case_status: CaseStatus`
197
+ - All 5 documents (purchase_order, invoice, grn, supplier_master, exception_flag)
198
+ - Agent history: `inspections`, `checks_run`, `queries`, `rules_applied`
199
+ - Decision state: `decision`, `decision_reason`, `routed_to`, `case_closed`, `close_summary`
200
+ - Action hints: `available_actions`, `available_checks`, `available_rules`, `knowledge_base`
201
+ - `cumulative_reward: float`
202
+
203
+ ### Writing style for models.py
204
+
205
+ ```python
206
+ """
207
+ Typed models for the Invoice Exception Handler OpenEnv environment.
208
+
209
+ Every object the agent sees or produces is defined here as a Pydantic model.
210
+ This is the single source of truth for the data contract between the
211
+ environment simulation and the agent.
212
+ """
213
+ from __future__ import annotations
214
+
215
+ import time
216
+ from enum import Enum
217
+ from typing import Any, Dict, List, Optional
218
+
219
+ from pydantic import BaseModel, Field
220
+
221
+
222
+ class ActionType(str, Enum):
223
+ INSPECT_FIELD = "inspect_field"
224
+ CROSS_CHECK = "cross_check"
225
+ # ... etc
226
+ ```
227
+
228
+ Do not put business logic in models.py. Just data shapes.
229
+
230
+ ---
231
+
232
+ ## Step 4 — env/tasks.py
233
+
234
+ This is the biggest file. It defines what happens when the agent takes each action —
235
+ the simulated responses, the rewards, and the grading logic.
236
+
237
+ ### EpisodeData class
238
+
239
+ A plain Python class (not Pydantic) that tracks everything the agent has done in one episode.
240
+
241
+ ```python
242
+ class EpisodeData:
243
+ """Tracks the full history of one episode for grading and state building."""
244
+
245
+ def __init__(self):
246
+ self.inspections: List[InspectionResult] = []
247
+ self.checks: List[CheckResult] = []
248
+ self.queries: List[QueryResult] = []
249
+ self.rules_applied: List[str] = []
250
+ self.decision: Optional[str] = None
251
+ self.decision_reason: Optional[str] = None
252
+ self.routed_to: List[str] = []
253
+ self.closed: bool = False
254
+ self.close_summary: Optional[str] = None
255
+ self.step_count: int = 0
256
+ self.cumulative_reward: float = 0.0
257
+
258
+ def has_inspected(self, doc: str, field: str) -> bool:
259
+ """Check if we already looked at this field in this document."""
260
+ return any(i.document == doc and i.field == field for i in self.inspections)
261
+
262
+ def has_checked(self, name: str) -> bool:
263
+ """Check if this validation check has already been run."""
264
+ return any(c.check_name == name for c in self.checks)
265
+
266
+ def has_queried(self, target: str) -> bool:
267
+ """Check if we already queried this person or department."""
268
+ return any(q.target == target for q in self.queries)
269
+ ```
270
+
271
+ ### BaseTask class
272
+
273
+ Abstract base that all three tasks inherit from. Every method raises `NotImplementedError`.
274
+
275
+ ```python
276
+ class BaseTask:
277
+ task_id: str = "base"
278
+ max_steps: int = 20
279
+ difficulty: str = "easy"
280
+
281
+ # Document factories — return fresh objects each time (no shared state)
282
+ def get_purchase_order(self) -> PurchaseOrder: raise NotImplementedError
283
+ def get_invoice(self) -> Invoice: raise NotImplementedError
284
+ def get_grn(self) -> GoodsReceiptNote: raise NotImplementedError
285
+ def get_supplier_master(self) -> SupplierMaster: raise NotImplementedError
286
+ def get_exception_flag(self) -> ExceptionFlag: raise NotImplementedError
287
+
288
+ # Simulators — each returns (result_object, reward_delta)
289
+ def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]: ...
290
+ def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]: ...
291
+ def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]: ...
292
+ def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]: ...
293
+ def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]: ...
294
+ def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]: ...
295
+ def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float: ...
296
+ def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float: ...
297
+ def simulate_close(self, summary: str, ep: EpisodeData) -> float: ...
298
+ def grade(self, ep: EpisodeData) -> Dict[str, float]: ...
299
+
300
+ # These are properties, not methods
301
+ @property
302
+ def available_checks(self) -> List[str]: return []
303
+
304
+ @property
305
+ def available_rules(self) -> List[str]: return []
306
+
307
+ @property
308
+ def knowledge_base(self) -> List[str]: return []
309
+ ```
310
+
311
+ ### The Three Tasks
312
+
313
+ #### Task 1: PriceVarianceTask (task1_price_variance)
314
+
315
+ **The scenario:** An office stationery supplier sends an invoice that's 3.08% above the PO.
316
+ Company policy allows ±2% automatic approval. Above that needs manual exception approval.
317
+ The supplier did communicate the price increase but procurement never updated the PO.
318
+
319
+ **task_id:** `"task1_price_variance"`
320
+ **max_steps:** `18`
321
+ **difficulty:** `"easy"`
322
+
323
+ **The documents:**
324
+
325
+ PO (PO-2024-1041): 3 stationery line items totalling ₹50,000
326
+ - A4 Paper 100 reams @ ₹220 = ₹22,000
327
+ - Ballpoint Pens 20 boxes @ ₹450 = ₹9,000
328
+ - Staplers 10 units @ ₹1,900 = ₹19,000
329
+
330
+ Invoice (INV-ON-8821): Same items, same quantities, but 2 items have higher unit prices
331
+ - A4 Paper @ ₹231 (+₹11, +5.0%)
332
+ - Ballpoint Pens @ ₹472 (+₹22, +4.9%)
333
+ - Staplers unchanged @ ₹1,900
334
+ - Subtotal: ₹51,540 (+₹1,540, +3.08%)
335
+ - 18% GST applied correctly: ₹9,277.20
336
+ - Total: ₹60,817.20
337
+
338
+ GRN (GRN-2024-0892): All items fully received, no pending, no rejected.
339
+
340
+ Supplier Master (SUP-0441 — OfficeNeed Supplies): Bank account and GSTIN both match invoice exactly. No fraud signals.
341
+
342
+ Exception Flag: `PRICE_MISMATCH` — "Invoice total ₹51,540 exceeds PO ₹50,000 by ₹1,540 (3.08%). Above auto-approval threshold."
343
+
344
+ **Knowledge base entries:**
345
+ - POL-001: Price variance ≤±2% may be auto-approved. Above 2% requires exception approval.
346
+ - POL-002: Exception approval requires confirmation from originating department.
347
+ - POL-003: Any approved invoice with a price change must be followed by a PO amendment request.
348
+ - POL-004: Bank account on invoice must match supplier master.
349
+
350
+ **Simulator logic:**
351
+
352
+ `simulate_inspect`: Return meaningful values for invoice line_items (+0.10), invoice total_amount (+0.08), po line_items (+0.06), grn items_received (+0.05). Return +0.01 for unknown fields.
353
+
354
+ `simulate_cross_check`: The key cross-checks are:
355
+ - `(unit_price, invoice, po)` → finds Paper and Pen mismatch, reward +0.12
356
+ - `(total_amount, invoice, po)` → confirms 3.08% variance, reward +0.10
357
+ - `(bank_account, invoice, supplier_master)` → match (no fraud), reward +0.03
358
+ - `(gstin, invoice, supplier_master)` → match, reward +0.02
359
+ - `(quantity, invoice, grn)` → match (full delivery), reward +0.04
360
+
361
+ `simulate_run_check`:
362
+ - `"tolerance_rule"` → 3.08% > 2%, FAILS, reward +0.14 (most important check)
363
+ - `"grn_match"` → PASSES (all received), reward +0.06
364
+ - `"duplicate_detection"` → PASSES (not a dup), reward +0.02
365
+ - `"bank_account_verification"` → PASSES, reward +0.02
366
+ - `"gst_verification"` → PASSES, reward +0.02
367
+ - `"po_match"` → FAILS on price, reward +0.08
368
+
369
+ `simulate_query_supplier`: Returns email from supplier explaining raw material price increase communicated to Arjun Mehta at procurement on Feb 20. Reward +0.10.
370
+
371
+ `simulate_query_internal`:
372
+ - `"procurement"` → Arjun Mehta confirms verbal approval, says he'll raise PO amendment. Reward +0.12.
373
+ - Others → generic responses, reward +0.03.
374
+
375
+ `simulate_apply_rule`:
376
+ - `"tolerance_2pct_auto_approve"` → BLOCKED (3.08% > 2%), reward −0.05
377
+ - `"tolerance_exception_approval"` → APPLIED, reward +0.10
378
+ - `"rejection_with_reason"` → APPLIED but wrong, reward −0.08
379
+ - `"partial_approval"` → not applicable here, reward −0.05
380
+
381
+ `simulate_make_decision`:
382
+ - `"approve"` with tolerance check + procurement query: reward +0.25
383
+ - `"approve"` with tolerance check only: reward +0.18
384
+ - `"approve"` with nothing checked: reward +0.05 (bad approval, should have verified)
385
+ - `"reject"`: reward −0.10 (wrong decision, delay supplier)
386
+ - `"hold"`: reward +0.08
387
+
388
+ `simulate_route_to`:
389
+ - `"procurement"` → reward +0.12 (correct — PO amendment needed)
390
+ - `"finance"` → reward +0.03
391
+ - `"legal"` → reward −0.05 (overkill for a price variance)
392
+
393
+ `simulate_close`: reward +0.12 if approved + tolerance checked + procurement routed; +0.06 if only some of those conditions hold; otherwise 0.
394
+
395
+ **Grader (`grade` method):**
396
+ ```python
397
+ def grade(self, ep: EpisodeData) -> Dict[str, float]:
398
+ checks_run = {c.check_name for c in ep.checks}
399
+ queries_to = {q.target for q in ep.queries}
400
+
401
+ # Did the agent correctly diagnose?
402
+ d = 0.0
403
+ if any("unit_price" in c.check_name or "total" in c.check_name
404
+ for c in ep.checks):
405
+ d += 0.12
406
+ if "tolerance_rule" in checks_run:
407
+ d += 0.14
408
+ if "grn_match" in checks_run:
409
+ d += 0.06
410
+
411
+ # Did the agent investigate properly?
412
+ i = 0.0
413
+ if "supplier" in queries_to:
414
+ i += 0.10
415
+ if "procurement" in queries_to:
416
+ i += 0.12
417
+ if "tolerance_exception_approval" in ep.rules_applied:
418
+ i += 0.08
419
+
420
+ # Correct decision?
421
+ dec = 0.0
422
+ if ep.decision == "approve": dec += 0.18
423
+ elif ep.decision == "hold": dec += 0.06
424
+ elif ep.decision == "reject": dec -= 0.10
425
+
426
+ # Correct routing?
427
+ route = 0.12 if "procurement" in ep.routed_to else 0.0
428
+
429
+ # Closed cleanly?
430
+ closure = 0.08 if ep.closed else 0.0
431
+
432
+ # Efficiency bonus — penalise extra steps
433
+ eff = max(0.0, 0.06 - 0.004 * max(0, ep.step_count - 9))
434
+
435
+ total = d + i + dec + route + closure + eff
436
+ return {
437
+ "score": round(max(0.0, min(1.0, total)), 4),
438
+ "diagnosis_score": round(d, 4),
439
+ "investigation_score": round(i, 4),
440
+ "decision_score": round(dec, 4),
441
+ "routing_score": round(route, 4),
442
+ "closure_score": round(closure, 4),
443
+ "efficiency_score": round(eff, 4),
444
+ }
445
+ ```
446
+
447
+ ---
448
+
449
+ #### Task 2: DuplicateTaxErrorTask (task2_duplicate_tax)
450
+
451
+ **The scenario:** Logistics supplier submits INV-2024-891 for transport services. System flags
452
+ it as a possible duplicate. Turns out it IS a duplicate of INV-2024-819 — the numbers differ
453
+ by digit transposition (891 vs 819). That original invoice was already paid. BUT: the original
454
+ invoice applied 15% GST when the correct rate is 18%. The company overpaid ₹3,240 in tax.
455
+ The new invoice has the correct rate. So it's both a duplicate AND a legitimate correction.
456
+
457
+ **task_id:** `"task2_duplicate_tax"`
458
+ **max_steps:** `20`
459
+ **difficulty:** `"medium"`
460
+
461
+ **The documents:**
462
+
463
+ PO (PO-2024-0778): Logistics services
464
+ - Mumbai-Pune Transport 20 trips @ ₹4,500 = ₹90,000
465
+ - Warehousing charges Feb 2024 @ ₹18,000 = ₹18,000
466
+ - Total: ₹1,08,000, Net-15 terms
467
+
468
+ Invoice (INV-2024-891): Same services, same amounts — correct on the face of it
469
+ - Subtotal: ₹1,08,000
470
+ - GST 18%: ₹19,440 ← this is CORRECT
471
+ - Total: ₹1,27,440
472
+
473
+ GRN (GRN-2024-0740): Services confirmed complete (transport + warehousing).
474
+
475
+ Supplier Master (SUP-0229 — FastMove Logistics): Bank and GSTIN match invoice. No fraud signals.
476
+
477
+ Exception Flag: `POSSIBLE_DUPLICATE` — "Invoice INV-2024-891 closely matches previously processed invoice."
478
+
479
+ **Hidden state (not in documents, revealed by checks):**
480
+ - INV-2024-819 was paid 12 days ago for ₹1,24,200
481
+ - INV-2024-819 applied 15% GST = ₹16,200 (wrong rate)
482
+ - Correct 18% GST = ₹19,440
483
+ - Company overpaid: ₹3,240
484
+
485
+ **Key checks and what they reveal:**
486
+
487
+ `run_check("duplicate_detection")` → FAILS → finds INV-2024-819 paid 12 days ago, reward +0.18
488
+
489
+ `run_check("tax_calculation_verify")` → FAILS → discovers the 15% error on original, reveals ₹3,240 delta, reward +0.16
490
+
491
+ `cross_check(invoice_number, invoice, payment_history)` → finds digit transposition, reward +0.15
492
+
493
+ `cross_check(tax_amount, invoice, payment_history)` → confirms ₹3,240 delta, reward +0.14
494
+
495
+ `query_internal("finance")` → confirms overpayment on original, reward +0.12
496
+
497
+ `query_supplier` → supplier confirms they know and wants partial approval for the delta, reward +0.10
498
+
499
+ `apply_rule("partial_approval")` → correct pathway, reward +0.12
500
+
501
+ `apply_rule("credit_note_request")` → supplier must issue credit note for the balance, reward +0.10
502
+
503
+ **Decision logic:**
504
+
505
+ `simulate_make_decision`:
506
+ - `"partial_approve"` with dup + tax found: reward +0.28 ← optimal
507
+ - `"partial_approve"` with dup only: reward +0.14 ← incomplete
508
+ - `"reject"` with dup found: reward +0.08 ← catches dup, misses correction
509
+ - `"approve"` (pays full duplicate): reward −0.15 ← bad
510
+
511
+ **Grader weights:**
512
+ - diagnosis_score: up to 0.30 (dup found +0.16, tax error found +0.14)
513
+ - investigation_score: up to 0.32 (finance queried, supplier queried, rules applied)
514
+ - decision_score: up to 0.20 (partial_approve = 0.20, reject = 0.05, approve = −0.15)
515
+ - routing_score: up to 0.08
516
+ - closure_score: up to 0.06
517
+
518
+ ---
519
+
520
+ #### Task 3: CompoundFraudTask (task3_compound_fraud)
521
+
522
+ **The scenario:** IT supplier submits ₹8,47,500 invoice for 15 laptops. System flags a bank
523
+ account change. But there are FOUR simultaneous fraud signals, all of which the agent must find.
524
+
525
+ **task_id:** `"task3_compound_fraud"`
526
+ **max_steps:** `25`
527
+ **difficulty:** `"hard"`
528
+
529
+ **The four signals:**
530
+
531
+ 1. **Bank account fraud (Signal 1):** Invoice has a different bank account than the supplier
532
+ master. The change request came from `techcore-solutions.com`. The registered domain is
533
+ `techcore-solutions.in`. Classic Business Email Compromise (BEC) attack.
534
+
535
+ 2. **GSTIN fraud (Signal 2):** The GST number on the invoice (`07AABCT9999X1Z8`) belongs to
536
+ "TechCore Trading Pvt Ltd" — a completely different entity in Delhi. Supplier master shows
537
+ `07AABCT1234Y1Z5` for "TechCore Solutions."
538
+
539
+ 3. **Quantity mismatch (Signal 3):** Invoice claims 15 laptops. GRN shows only 13 received.
540
+ 2 units are still marked as pending.
541
+
542
+ 4. **Price inflation (Signal 4):** ₹56,500/unit on invoice vs ₹52,000/unit on PO. That's
543
+ 8.65% above the agreed price. No price revision was ever approved.
544
+
545
+ **Bonus signals (smaller, still notable):**
546
+ - Invoice is dated a Sunday (2024-03-10) — unusual for B2B
547
+ - PO was raised Friday March 8 — 2-day turnaround is suspiciously fast for IT equipment
548
+
549
+ **The critical trap — channel selection:**
550
+
551
+ `simulate_query_supplier(question, channel="email")` →
552
+ Returns fraudster's response urging payment to the new account. Reward: **−0.15**.
553
+
554
+ `simulate_query_supplier(question, channel="phone")` →
555
+ The real TechCore Solutions confirms they sent no bank change request. Confirms fraud. Reward: **+0.15**.
556
+
557
+ This tests whether the agent follows POL-009 ("bank account change must be verified via
558
+ registered phone number — NEVER via email") which is in the knowledge base.
559
+
560
+ **Available checks and rewards:**
561
+
562
+ ```python
563
+ "bank_account_verification" → FAILS, finds lookalike domain, reward +0.18
564
+ "gst_verification" → FAILS, GST belongs to different entity, reward +0.18
565
+ "grn_match" → FAILS, 13 vs 15 received, reward +0.14
566
+ "email_domain_verification" → FAILS, lookalike domain confirmed, reward +0.16
567
+ "invoice_date_validation" → FAILS, Sunday flag, reward +0.08
568
+ "quantity_check" → FAILS, quantity inflated, reward +0.12
569
+ "price_check" → FAILS, 8.65% above PO, reward +0.10
570
+ "duplicate_detection" → PASSES (not a dup), reward +0.02
571
+ "po_match" → FAILS (GST + qty + price all wrong), reward +0.08
572
+ ```
573
+
574
+ **Decision logic:**
575
+
576
+ `simulate_make_decision`:
577
+ - `"reject"` → reward = 0.10 + 0.05 × (number of signals found) → max ~0.30
578
+ - `"approve"` → reward −0.40 (catastrophic — approved fraud)
579
+ - `"partial_approve"` → reward −0.20 (you can't partially approve fraud)
580
+ - `"hold"` → reward = 0.08 + 0.03 × signals found → acceptable but not optimal
581
+
582
+ **Route logic:**
583
+
584
+ ```python
585
+ "legal" → reward +0.14 # must escalate to legal
586
+ "security" → reward +0.12 # BEC attack needs security investigation
587
+ "finance" → reward +0.08 # finance needs to block payment
588
+ "procurement" → reward +0.06
589
+ ```
590
+
591
+ **Grader — the signal detection scoring:**
592
+
593
+ ```python
594
+ def grade(self, ep: EpisodeData) -> Dict[str, float]:
595
+ checked = {c.check_name for c in ep.checks}
596
+
597
+ bank_found = "bank_account_verification" in checked
598
+ gst_found = "gst_verification" in checked
599
+ qty_found = "grn_match" in checked
600
+ domain_found = "email_domain_verification" in checked
601
+ price_found = "price_check" in checked
602
+
603
+ # Diagnosis — finding all signals is the whole point
604
+ d = (0.12 if bank_found else 0) + (0.12 if gst_found else 0) \
605
+ + (0.10 if qty_found else 0) + (0.10 if domain_found else 0) \
606
+ + (0.06 if price_found else 0)
607
+
608
+ # Investigation — reward for using phone not email
609
+ i = 0.0
610
+ for q in ep.queries:
611
+ if q.target == "supplier" and q.channel not in ("email", "mail"):
612
+ i += 0.10 # correct channel
613
+ elif q.target == "supplier" and q.channel in ("email", "mail"):
614
+ i -= 0.15 # contacting fraudster
615
+ if "legal" in {q.target for q in ep.queries}: i += 0.06
616
+ if "security" in {q.target for q in ep.queries}: i += 0.06
617
+
618
+ # Decision
619
+ signals = sum([bank_found, gst_found, qty_found, domain_found])
620
+ dec = 0.0
621
+ if ep.decision == "reject":
622
+ dec = 0.08 + 0.03 * signals
623
+ elif ep.decision == "approve":
624
+ dec = -0.35
625
+ elif ep.decision == "partial_approve":
626
+ dec = -0.15
627
+ elif ep.decision == "hold":
628
+ dec = 0.06
629
+
630
+ # Routing
631
+ routes = set(ep.routed_to)
632
+ route = (0.10 if "legal" in routes else 0) \
633
+ + (0.06 if "security" in routes else 0) \
634
+ + (0.04 if "finance" in routes else 0)
635
+
636
+ closure = 0.06 if (ep.closed and ep.decision == "reject") else 0.0
637
+ eff = max(0.0, 0.04 - 0.002 * max(0, ep.step_count - 12))
638
+
639
+ total = d + i + dec + route + closure + eff
640
+ return {
641
+ "score": round(max(0.0, min(1.0, total)), 4),
642
+ "signals_found": sum([bank_found, gst_found, qty_found, domain_found, price_found]),
643
+ "diagnosis_score": round(d, 4),
644
+ "investigation_score": round(i, 4),
645
+ "decision_score": round(dec, 4),
646
+ "routing_score": round(route, 4),
647
+ "closure_score": round(closure, 4),
648
+ "efficiency_score": round(eff, 4),
649
+ }
650
+ ```
651
+
652
+ ### Task Registry
653
+
654
+ At the bottom of tasks.py:
655
+
656
+ ```python
657
+ TASK_REGISTRY: Dict[str, type] = {
658
+ "task1_price_variance": PriceVarianceTask,
659
+ "task2_duplicate_tax": DuplicateTaxErrorTask,
660
+ "task3_compound_fraud": CompoundFraudTask,
661
+ }
662
+
663
+ ALL_TASKS = list(TASK_REGISTRY.keys())
664
+
665
+ def make_task(task_id: str) -> BaseTask:
666
+ cls = TASK_REGISTRY.get(task_id)
667
+ if cls is None:
668
+ raise ValueError(f"Unknown task '{task_id}'. Available: {ALL_TASKS}")
669
+ return cls()
670
+ ```
671
+
672
+ ---
673
+
674
+ ## Step 5 — env/environment.py
675
+
676
+ This is the `InvoiceExceptionEnv` class. It is the only thing external code needs to import.
677
+
678
+ ```python
679
+ class InvoiceExceptionEnv:
680
+ """
681
+ OpenEnv-compatible Invoice Exception Handler environment.
682
+
683
+ Usage:
684
+ env = InvoiceExceptionEnv(seed=42)
685
+ obs = env.reset("task1_price_variance")
686
+ result = env.step(Action.run_check("tolerance_rule"))
687
+ scores = env.grade()
688
+ """
689
+ ```
690
+
691
+ ### Constructor
692
+
693
+ Takes an optional `seed: Optional[int] = None` for reproducibility.
694
+ Initialises `self._rng = random.Random(seed)`.
695
+ Initialises `self._task`, `self._ep`, `self._state`, `self._done` all to None/False.
696
+
697
+ ### reset(task_id)
698
+
699
+ ```python
700
+ def reset(self, task_id: Optional[str] = None) -> EnvironmentState:
701
+ """
702
+ Start a new episode. If task_id is None, picks one at random.
703
+ Returns the initial EnvironmentState showing all documents and available actions.
704
+ """
705
+ ```
706
+
707
+ 1. Pick task (random if None)
708
+ 2. Create `EpisodeData()`
709
+ 3. Set `self._done = False`
710
+ 4. Call `self._build_state()` and store result
711
+ 5. Return the state
712
+
713
+ ### step(action)
714
+
715
+ ```python
716
+ def step(self, action: Union[Action, Dict[str, Any]]) -> StepResult:
717
+ """
718
+ Execute one action. Returns observation, reward, done flag, and info dict.
719
+ Raises RuntimeError if called before reset() or after the episode is done.
720
+ """
721
+ ```
722
+
723
+ 1. Validate we're in an active episode
724
+ 2. Convert dict to Action if needed
725
+ 3. Call `self._dispatch(action)` → gets (reward, info)
726
+ 4. Increment step count
727
+ 5. Check SLA (step count vs max_steps)
728
+ 6. Check done condition (closed or SLA breach)
729
+ 7. Rebuild state
730
+ 8. Return StepResult
731
+
732
+ ### state()
733
+
734
+ Non-destructive. Just returns `self._state`. Raises RuntimeError if not initialised.
735
+
736
+ ### grade()
737
+
738
+ Calls `self._task.grade(self._ep)` and returns the dict.
739
+
740
+ ### _dispatch(action)
741
+
742
+ The routing function. A single if/elif chain for each ActionType.
743
+
744
+ For each action:
745
+ 1. Call the appropriate task simulator
746
+ 2. Update EpisodeData
747
+ 3. Return (reward, info dict)
748
+
749
+ Handle repeated actions (inspect same field twice, check same thing twice) with a small −0.02 to −0.05 penalty and return early.
750
+
751
+ ### _build_state()
752
+
753
+ Constructs an `EnvironmentState` from the current `_task` and `_ep`. Called after every step.
754
+ Also determines the current `CaseStatus` based on episode data.
755
+
756
+ ### action_space_sample()
757
+
758
+ Returns a random valid action (for random baseline agents). Uses `self._rng` for reproducibility.
759
+
760
+ ---
761
+
762
+ ## Step 6 — env/__init__.py
763
+
764
+ ```python
765
+ from .environment import InvoiceExceptionEnv
766
+ from .models import Action, ActionType, EnvironmentState, StepResult
767
+ from .tasks import ALL_TASKS, make_task
768
+
769
+ __all__ = [
770
+ "InvoiceExceptionEnv",
771
+ "Action",
772
+ "ActionType",
773
+ "EnvironmentState",
774
+ "StepResult",
775
+ "ALL_TASKS",
776
+ "make_task",
777
+ ]
778
+ ```
779
+
780
+ ---
781
+
782
+ ## Step 7 — Smoke Test Before Continuing
783
+
784
+ Before writing openenv.yaml or inference.py, verify the environment works.
785
+
786
+ ```python
787
+ # test_smoke.py — run this, do not commit it
788
+ from env import InvoiceExceptionEnv, Action, ALL_TASKS
789
+
790
+ print("Tasks:", ALL_TASKS)
791
+
792
+ env = InvoiceExceptionEnv(seed=42)
793
+
794
+ for task_id in ALL_TASKS:
795
+ obs = env.reset(task_id)
796
+ print(f"\n--- {task_id} ---")
797
+ print("Ticket:", obs.exception_flag.flag_description[:80])
798
+
799
+ # Take a few actions
800
+ r1 = env.step(Action.run_check(obs.available_checks[0]))
801
+ print(f"Step 1 reward: {r1.reward}")
802
+
803
+ r2 = env.step(Action.make_decision("approve", "test"))
804
+ print(f"Step 2 reward: {r2.reward}")
805
+
806
+ r3 = env.step(Action.close_case("closed"))
807
+ print(f"Step 3 reward: {r3.reward}, done: {r3.done}")
808
+
809
+ scores = env.grade()
810
+ print(f"Grade: {scores['score']}")
811
+
812
+ print("\nSmoke test passed.")
813
+ ```
814
+
815
+ All three tasks must complete without errors. Scores must be in [0.0, 1.0].
816
+
817
+ ---
818
+
819
+ ## Step 8 — openenv.yaml
820
+
821
+ This file must pass `openenv validate`. Write it carefully.
822
+
823
+ ```yaml
824
+ # openenv.yaml
825
+ name: Invoice Exception Handler
826
+ version: "1.0.0"
827
+ description: |
828
+ An agent learning environment simulating accounts payable exception handling.
829
+ The agent acts as an AP analyst: investigates flagged invoices, applies business
830
+ rules, detects fraud signals, makes decisions, and closes cases with an audit trail.
831
+
832
+ authors:
833
+ - name: Your Name
834
+ email: your@email.com
835
+
836
+ license: MIT
837
+
838
+ tasks:
839
+ - id: task1_price_variance
840
+ name: Price Variance Exception
841
+ difficulty: easy
842
+ description: |
843
+ Office stationery invoice arrives 3.08% above PO. Company tolerance policy
844
+ allows ±2% auto-approval. Agent must detect the variance, verify through
845
+ the tolerance rule, confirm verbal approval with procurement, and approve
846
+ with a PO amendment request.
847
+ max_steps: 18
848
+ optimal_score: 1.0
849
+ min_passing_score: 0.60
850
+
851
+ - id: task2_duplicate_tax
852
+ name: Duplicate Invoice with Tax Error
853
+ difficulty: medium
854
+ description: |
855
+ Logistics supplier submits INV-2024-891, a duplicate of paid INV-2024-819
856
+ (digit transposition: 891 vs 819). Original invoice had wrong GST rate (15%
857
+ vs correct 18%) — company overpaid ₹3,240. New invoice has correct rate.
858
+ Agent must detect the duplicate, identify the tax error in the original,
859
+ and partially approve only the ₹3,240 tax correction.
860
+ max_steps: 20
861
+ optimal_score: 1.0
862
+ min_passing_score: 0.50
863
+
864
+ - id: task3_compound_fraud
865
+ name: Compound Fraud Signals
866
+ difficulty: hard
867
+ description: |
868
+ IT equipment supplier invoice with four simultaneous fraud signals: bank
869
+ account changed via BEC attack (lookalike email domain), GSTIN belongs to
870
+ a different entity, 2 of 15 laptops not yet received, and unit price 8.65%
871
+ above PO. Agent must find all signals, use the correct communication channel
872
+ (phone, not email — which would contact the fraudster), and escalate to legal
873
+ and security.
874
+ max_steps: 25
875
+ optimal_score: 1.0
876
+ min_passing_score: 0.40
877
+
878
+ observation_space:
879
+ type: object
880
+ description: EnvironmentState Pydantic model
881
+ fields:
882
+ task_id: {type: string}
883
+ step_number: {type: integer}
884
+ case_status: {type: string, enum: [open, in_review, decided, routed, closed]}
885
+ purchase_order: {type: object, description: "PO with line items and terms"}
886
+ invoice: {type: object, description: "Supplier invoice with line items and tax"}
887
+ grn: {type: object, description: "Goods receipt — what actually arrived"}
888
+ supplier_master: {type: object, description: "Verified supplier record"}
889
+ exception_flag: {type: object, description: "Why the system flagged this invoice"}
890
+ inspections: {type: array, description: "Fields the agent has inspected"}
891
+ checks_run: {type: array, description: "Validation checks completed"}
892
+ queries: {type: array, description: "Internal and supplier queries"}
893
+ rules_applied: {type: array, description: "Business rules applied"}
894
+ decision: {type: string, nullable: true}
895
+ routed_to: {type: array}
896
+ available_actions: {type: array}
897
+ available_checks: {type: array}
898
+ available_rules: {type: array}
899
+ knowledge_base: {type: array}
900
+ cumulative_reward: {type: number}
901
+
902
+ action_space:
903
+ type: object
904
+ description: Action with type and params
905
+ actions:
906
+ inspect_field:
907
+ params: {document: string, field: string}
908
+ cross_check:
909
+ params: {field: string, doc_a: string, doc_b: string}
910
+ run_check:
911
+ params: {check_name: string}
912
+ query_supplier:
913
+ params: {question: string, channel: string}
914
+ query_internal:
915
+ params: {department: string, question: string}
916
+ apply_rule:
917
+ params: {rule_id: string}
918
+ make_decision:
919
+ params: {decision: string, reason: string}
920
+ route_to:
921
+ params: {team: string, notes: string}
922
+ close_case:
923
+ params: {summary: string}
924
+
925
+ reward:
926
+ range: [-1.0, 1.0]
927
+ description: |
928
+ Shaped reward at every step. Relevant inspections: +0.01 to +0.14.
929
+ Diagnostics revealing issues: +0.08 to +0.18. Correct fixes: +0.08 to +0.30.
930
+ Wrong decision on fraud: -0.15 to -0.40. Repeat actions: -0.02 to -0.05.
931
+ SLA breach: -0.10.
932
+
933
+ grading:
934
+ method: task_grader
935
+ scores:
936
+ - score # 0.0–1.0 overall
937
+ - diagnosis_score
938
+ - investigation_score
939
+ - decision_score
940
+ - routing_score
941
+ - closure_score
942
+ - efficiency_score
943
+
944
+ api:
945
+ reset:
946
+ signature: "reset(task_id: str | None = None) -> EnvironmentState"
947
+ step:
948
+ signature: "step(action: Action | dict) -> StepResult"
949
+ state:
950
+ signature: "state() -> EnvironmentState"
951
+ grade:
952
+ signature: "grade() -> Dict[str, float]"
953
+
954
+ http_endpoints:
955
+ - path: /reset
956
+ method: POST
957
+ description: Reset environment, returns EnvironmentState JSON
958
+ - path: /step
959
+ method: POST
960
+ description: Execute action, returns StepResult JSON
961
+ - path: /state
962
+ method: GET
963
+ description: Current state, returns EnvironmentState JSON
964
+ - path: /grade
965
+ method: POST
966
+ description: Grade current episode
967
+ - path: /health
968
+ method: GET
969
+ description: Health check
970
+
971
+ dependencies:
972
+ python: ">=3.11"
973
+ packages:
974
+ - pydantic==2.7.1
975
+ - fastapi==0.111.0
976
+ - uvicorn==0.29.0
977
+ - gradio==4.36.1
978
+ - openai==1.35.3
979
+ - pyyaml==6.0.1
980
+
981
+ docker:
982
+ port: 7860
983
+ health_check: /health
984
+ ```
985
+
986
+ ---
987
+
988
+ ## Step 9 — inference.py
989
+
990
+ This is the most critical file for the hackathon validator. Get the format exactly right.
991
+
992
+ ### Required env vars
993
+
994
+ ```python
995
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
996
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
997
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "")
998
+ ```
999
+
1000
+ ### Required stdout format
1001
+
1002
+ Every line to stdout must be exactly:
1003
+ ```
1004
+ [START] task=<task_id> env=invoice-exception-handler model=<model_name>
1005
+ [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
1006
+ [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
1007
+ ```
1008
+
1009
+ Rules (do not deviate):
1010
+ - One `[START]` line at episode begin
1011
+ - One `[STEP]` line per step, immediately after `env.step()` returns
1012
+ - One `[END]` line after the episode, always emitted even on exception
1013
+ - `reward` and all values in `rewards` formatted to exactly 2 decimal places
1014
+ - `score` formatted to exactly 3 decimal places
1015
+ - `done` and `success` are lowercase: `true` or `false`
1016
+ - `error` is the error message string, or exactly `null` if none
1017
+ - No newlines within a single line
1018
+ - `flush=True` on every print so the validator sees output in real time
1019
+
1020
+ ### System prompt for the LLM
1021
+
1022
+ Write a clear system prompt that tells the model:
1023
+ - It is an AP analyst handling a flagged invoice
1024
+ - It has a structured action space (list all 9 action types)
1025
+ - It must respond in JSON: `{"type": "...", "params": {...}}`
1026
+ - It should investigate before deciding
1027
+ - Never approve without checking, never contact supplier by email if fraud is suspected
1028
+ - Available documents: PO, Invoice, GRN, Supplier Master, Exception Flag
1029
+
1030
+ ### User prompt per step
1031
+
1032
+ Include in the user prompt:
1033
+ - Current step number and max steps
1034
+ - The exception flag (what was flagged and why)
1035
+ - Available checks (list them)
1036
+ - Available rules (list them)
1037
+ - Knowledge base entries (the policy list)
1038
+ - What has been done so far (checks run, queries made, inspections done)
1039
+ - Current cumulative reward
1040
+ - Ask for next action as JSON
1041
+
1042
+ ### Parsing LLM output
1043
+
1044
+ ```python
1045
+ def parse_action(raw_text: str) -> dict:
1046
+ """
1047
+ Parse the model's response into an action dict.
1048
+ Handles markdown code fences, extra whitespace, and minor formatting errors.
1049
+ Falls back to run_check(po_match) if parsing fails.
1050
+ """
1051
+ text = raw_text.strip()
1052
+ # Remove ```json or ``` fences if present
1053
+ if text.startswith("```"):
1054
+ lines = text.split("\n")
1055
+ text = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])
1056
+ try:
1057
+ return json.loads(text.strip())
1058
+ except json.JSONDecodeError:
1059
+ # Try to find JSON within the text
1060
+ import re
1061
+ match = re.search(r'\{.*\}', text, re.DOTALL)
1062
+ if match:
1063
+ try:
1064
+ return json.loads(match.group())
1065
+ except json.JSONDecodeError:
1066
+ pass
1067
+ # Safe fallback
1068
+ return {"type": "run_check", "params": {"check_name": "po_match"}}
1069
+ ```
1070
+
1071
+ ### Overall structure
1072
+
1073
+ ```python
1074
+ def run_task(client, env, task_id, max_steps=20):
1075
+ """Run one task episode and return (steps_taken, score, rewards)."""
1076
+ rewards = []
1077
+
1078
+ print(f"[START] task={task_id} env=invoice-exception-handler model={MODEL_NAME}", flush=True)
1079
+
1080
+ obs = env.reset(task_id)
1081
+ history = []
1082
+
1083
+ for step in range(1, max_steps + 1):
1084
+ # Build prompt from observation
1085
+ user_prompt = build_prompt(obs, step, max_steps, history)
1086
+
1087
+ # Call LLM
1088
+ raw = call_llm(client, user_prompt)
1089
+ action_dict = parse_action(raw)
1090
+
1091
+ # Execute
1092
+ try:
1093
+ result = env.step(action_dict)
1094
+ reward = result.reward
1095
+ done = result.done
1096
+ error = None
1097
+ except Exception as e:
1098
+ reward = 0.0
1099
+ done = False
1100
+ error = str(e)
1101
+ result = None
1102
+
1103
+ rewards.append(reward)
1104
+ action_str = json.dumps(action_dict)
1105
+
1106
+ print(
1107
+ f"[STEP] step={step} action={action_str} "
1108
+ f"reward={reward:.2f} done={str(done).lower()} "
1109
+ f"error={error or 'null'}",
1110
+ flush=True
1111
+ )
1112
+
1113
+ history.append(f"Step {step}: {action_str} → reward {reward:+.2f}")
1114
+
1115
+ if result:
1116
+ obs = result.observation
1117
+
1118
+ if done:
1119
+ break
1120
+
1121
+ score = env.grade()["score"]
1122
+ success = score >= 0.5
1123
+ steps_taken = min(step, max_steps)
1124
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
1125
+
1126
+ print(
1127
+ f"[END] success={str(success).lower()} steps={steps_taken} "
1128
+ f"score={score:.3f} rewards={rewards_str}",
1129
+ flush=True
1130
+ )
1131
+
1132
+ return steps_taken, score, rewards
1133
+
1134
+
1135
+ def main():
1136
+ client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
1137
+ env = InvoiceExceptionEnv(seed=42)
1138
+
1139
+ for task_id in ALL_TASKS:
1140
+ run_task(client, env, task_id, max_steps=make_task(task_id).max_steps)
1141
+
1142
+
1143
+ if __name__ == "__main__":
1144
+ main()
1145
+ ```
1146
+
1147
+ ---
1148
+
1149
+ ## Step 10 — app.py
1150
+
1151
+ The app.py serves two purposes:
1152
+ 1. Provides the FastAPI HTTP endpoints that the validator pings (`POST /reset` must return 200)
1153
+ 2. Provides a Gradio UI for interactive exploration on HF Spaces
1154
+
1155
+ ### Architecture
1156
+
1157
+ Run both FastAPI and Gradio in the same process on port 7860.
1158
+ Use `gr.mount_gradio_app` to mount Gradio on FastAPI, or run Gradio alongside FastAPI.
1159
+
1160
+ The cleanest approach:
1161
+
1162
+ ```python
1163
+ import gradio as gr
1164
+ from fastapi import FastAPI
1165
+ from fastapi.responses import JSONResponse
1166
+ import uvicorn
1167
+
1168
+ app = FastAPI(title="Invoice Exception Handler OpenEnv")
1169
+ env = InvoiceExceptionEnv(seed=42) # shared environment instance
1170
+
1171
+ @app.post("/reset")
1172
+ async def http_reset(body: dict | None = None):
1173
+ task_id = (body or {}).get("task_id")
1174
+ obs = env.reset(task_id)
1175
+ return JSONResponse(obs.model_dump(mode="json"))
1176
+
1177
+ @app.post("/step")
1178
+ async def http_step(body: dict):
1179
+ result = env.step(body)
1180
+ return JSONResponse(result.model_dump(mode="json"))
1181
+
1182
+ @app.get("/state")
1183
+ async def http_state():
1184
+ return JSONResponse(env.state().model_dump(mode="json"))
1185
+
1186
+ @app.post("/grade")
1187
+ async def http_grade():
1188
+ return JSONResponse(env.grade())
1189
+
1190
+ @app.get("/tasks")
1191
+ async def http_tasks():
1192
+ return JSONResponse(ALL_TASKS)
1193
+
1194
+ @app.get("/health")
1195
+ async def health():
1196
+ return JSONResponse({"status": "ok", "version": "1.0.0"})
1197
+
1198
+ # Mount Gradio at the root path
1199
+ gradio_app = build_gradio_ui()
1200
+ app = gr.mount_gradio_app(app, gradio_app, path="/")
1201
+ ```
1202
+
1203
+ ### Gradio UI — what to build
1204
+
1205
+ Keep the UI simple and functional. Three tabs:
1206
+
1207
+ **Tab 1: Manual Play**
1208
+ - Dropdown to select task (labels: "Task 1 — Price Variance (Easy)", etc.)
1209
+ - Reset button
1210
+ - Shows the exception flag, the key document fields, and available actions
1211
+ - Dropdown or textbox to compose and submit an action
1212
+ - Shows reward, cumulative reward, and status after each step
1213
+ - Shows grade breakdown when episode ends
1214
+
1215
+ **Tab 2: Agent Demo**
1216
+ - Select task
1217
+ - Shows a hardcoded optimal action sequence running step by step
1218
+ - Good for demonstrating the environment to judges who won't run code
1219
+
1220
+ **Tab 3: API Reference**
1221
+ - Code examples for each action type
1222
+ - Reward table
1223
+ - Grader score breakdown explanation
1224
+
1225
+ ---
1226
+
1227
+ ## Step 11 — Dockerfile
1228
+
1229
+ ```dockerfile
1230
+ FROM python:3.11-slim
1231
+
1232
+ # Install system dependencies
1233
+ RUN apt-get update \
1234
+ && apt-get install -y --no-install-recommends curl \
1235
+ && rm -rf /var/lib/apt/lists/*
1236
+
1237
+ # Create non-root user (required by HF Spaces)
1238
+ RUN useradd -m -u 1000 appuser
1239
+
1240
+ WORKDIR /app
1241
+
1242
+ # Copy and install dependencies first (layer caching)
1243
+ COPY requirements.txt .
1244
+ RUN pip install --no-cache-dir -r requirements.txt
1245
+
1246
+ # Copy application code
1247
+ COPY --chown=appuser:appuser . .
1248
+
1249
+ USER appuser
1250
+
1251
+ EXPOSE 7860
1252
+
1253
+ # Health check — pings the /health endpoint
1254
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=20s --retries=3 \
1255
+ CMD curl -f http://localhost:7860/health || exit 1
1256
+
1257
+ ENV PYTHONUNBUFFERED=1
1258
+ ENV GRADIO_SERVER_NAME=0.0.0.0
1259
+ ENV GRADIO_SERVER_PORT=7860
1260
+
1261
+ CMD ["python", "app.py"]
1262
+ ```
1263
+
1264
+ ---
1265
+
1266
+ ## Step 12 — End-to-End Test Checklist
1267
+
1268
+ Before pushing, check every item in this list.
1269
+
1270
+ ```bash
1271
+ # 1. Imports work
1272
+ python -c "from env import InvoiceExceptionEnv, Action, ALL_TASKS; print('OK')"
1273
+
1274
+ # 2. All three tasks complete without errors
1275
+ python -c "
1276
+ from env import InvoiceExceptionEnv, Action, ALL_TASKS
1277
+ env = InvoiceExceptionEnv(seed=42)
1278
+ for t in ALL_TASKS:
1279
+ obs = env.reset(t)
1280
+ result = env.step(Action.run_check(obs.available_checks[0]))
1281
+ result = env.step(Action.make_decision('reject', 'test'))
1282
+ result = env.step(Action.close_case('test'))
1283
+ score = env.grade()['score']
1284
+ assert 0.0 <= score <= 1.0, f'Score out of range: {score}'
1285
+ print(f'{t}: {score}')
1286
+ print('All tasks OK')
1287
+ "
1288
+
1289
+ # 3. Graders are deterministic
1290
+ python -c "
1291
+ from env import InvoiceExceptionEnv, Action
1292
+ env1 = InvoiceExceptionEnv(seed=42)
1293
+ env2 = InvoiceExceptionEnv(seed=42)
1294
+ obs1 = env1.reset('task1_price_variance')
1295
+ obs2 = env2.reset('task1_price_variance')
1296
+ env1.step(Action.run_check('tolerance_rule'))
1297
+ env2.step(Action.run_check('tolerance_rule'))
1298
+ env1.step(Action.make_decision('approve', 'test'))
1299
+ env2.step(Action.make_decision('approve', 'test'))
1300
+ env1.step(Action.close_case('done'))
1301
+ env2.step(Action.close_case('done'))
1302
+ s1 = env1.grade()['score']
1303
+ s2 = env2.grade()['score']
1304
+ assert s1 == s2, f'Non-deterministic: {s1} vs {s2}'
1305
+ print(f'Deterministic: {s1}')
1306
+ "
1307
+
1308
+ # 4. inference.py log format (with fake API key)
1309
+ API_BASE_URL=https://api.example.com HF_TOKEN=fake MODEL_NAME=test python -c "
1310
+ # This will fail on the API call but should print [START] before failing
1311
+ import subprocess, sys
1312
+ "
1313
+ # Manually verify the [START] line would print correctly
1314
+
1315
+ # 5. Docker builds
1316
+ docker build -t invoice-env-test .
1317
+
1318
+ # 6. Docker runs and /health returns 200
1319
+ docker run -d -p 7860:7860 --name test-env invoice-env-test
1320
+ sleep 15
1321
+ curl -f http://localhost:7860/health
1322
+ curl -s -X POST http://localhost:7860/reset -H "Content-Type: application/json" -d '{}'
1323
+ docker stop test-env && docker rm test-env
1324
+
1325
+ # 7. openenv validate (if openenv-core is installed)
1326
+ pip install openenv-core
1327
+ openenv validate
1328
+ ```
1329
+
1330
+ ---
1331
+
1332
+ ## Step 13 — documents/ Folder
1333
+
1334
+ Create these four files. Keep them updated as the project evolves.
1335
+
1336
+ ### documents/CHANGELOG.md
1337
+
1338
+ ```markdown
1339
+ # Changelog
1340
+
1341
+ All changes to the Invoice Exception Handler environment are recorded here.
1342
+ Format: Date | Version | What changed | Why
1343
+
1344
+ ---
1345
+
1346
+ ## [1.0.0] — 2025-01-20
1347
+
1348
+ ### Added
1349
+ - Initial implementation of InvoiceExceptionEnv with full OpenEnv API
1350
+ - Three tasks: task1_price_variance, task2_duplicate_tax, task3_compound_fraud
1351
+ - Pydantic v2 typed models for all environment objects
1352
+ - FastAPI HTTP endpoints for HF Spaces validation
1353
+ - Gradio UI for interactive exploration
1354
+ - inference.py using OpenAI client with [START][STEP][END] log format
1355
+ - openenv.yaml spec file
1356
+ - Dockerfile for HF Spaces deployment
1357
+
1358
+ ### Design decisions
1359
+ - Used pure Python simulation (no external databases) for portability and determinism
1360
+ - Compound fraud task has four signals to prevent simple greedy agents from scoring well
1361
+ - Channel selection in Task 3 (phone vs email) tests policy knowledge, not just anomaly detection
1362
+ - Grader uses sub-scores to allow partial credit for partial solutions
1363
+ ```
1364
+
1365
+ ### documents/ARCHITECTURE.md
1366
+
1367
+ Document the system architecture. Include:
1368
+ - A text diagram of how the components connect
1369
+ - Why FastAPI and Gradio in the same process (HF Spaces constraint)
1370
+ - Why Pydantic v2 (spec requirement, validation)
1371
+ - How EpisodeData separates mutable state from immutable document context
1372
+ - Why tasks are separate classes (easy to extend)
1373
+
1374
+ ### documents/BASELINE-SCORES.md
1375
+
1376
+ Record the reproducible baseline scores. Run them yourself and copy the output here.
1377
+
1378
+ ```markdown
1379
+ # Baseline Scores
1380
+
1381
+ Recorded on: 2025-01-20
1382
+ Seed: 42
1383
+ Machine: 2 vCPU, 8GB RAM
1384
+
1385
+ ## Random Agent (action_space_sample())
1386
+
1387
+ | Task | Score | Steps |
1388
+ |------|-------|-------|
1389
+ | task1_price_variance | ~0.18 | 18 (SLA breach) |
1390
+ | task2_duplicate_tax | ~0.12 | 20 (SLA breach) |
1391
+ | task3_compound_fraud | ~0.08 | 25 (SLA breach) |
1392
+ | **Average** | **~0.13** | |
1393
+
1394
+ ## Optimal Agent (hardcoded correct actions)
1395
+
1396
+ | Task | Score | Steps |
1397
+ |------|-------|-------|
1398
+ | task1_price_variance | ~0.98 | 9 |
1399
+ | task2_duplicate_tax | ~0.95 | 10 |
1400
+ | task3_compound_fraud | ~0.92 | 14 |
1401
+ | **Average** | **~0.95** | |
1402
+ ```
1403
+
1404
+ ---
1405
+
1406
+ ## Step 14 — Push and Verify
1407
+
1408
+ ```bash
1409
+ # Final commit
1410
+ git add .
1411
+ git commit -m "feat: complete invoice exception handler v1.0.0
1412
+
1413
+ - 3 tasks with deterministic graders (easy/medium/hard)
1414
+ - Full OpenEnv API: reset/step/state/grade
1415
+ - FastAPI HTTP endpoints for validator (/reset, /step, /state, /health)
1416
+ - Gradio UI for HF Spaces
1417
+ - inference.py with OpenAI client and [START][STEP][END] format
1418
+ - openenv.yaml spec
1419
+ - Dockerfile for HF Spaces deployment
1420
+ - documents/ folder with PRD, changelog, architecture, baseline scores"
1421
+
1422
+ git push origin main
1423
+
1424
+ # Deploy to HF Spaces (if not using git-based deployment)
1425
+ # The Dockerfile and app.py handle this automatically when pushed to HF
1426
+ ```
1427
+
1428
+ ---
1429
+
1430
+ ## Action Space Reference
1431
+
1432
+ | Action Type | Required Params | Description |
1433
+ |---|---|---|
1434
+ | `inspect_field` | `document, field` | Look at a specific field in a document |
1435
+ | `cross_check` | `field, doc_a, doc_b` | Compare a field between two documents |
1436
+ | `run_check` | `check_name` | Run a named validation check |
1437
+ | `query_supplier` | `question, channel` | Ask the supplier something (channel: phone or email) |
1438
+ | `query_internal` | `department, question` | Ask an internal team |
1439
+ | `apply_rule` | `rule_id` | Apply a business policy rule |
1440
+ | `make_decision` | `decision, reason` | approve / reject / hold / partial_approve |
1441
+ | `route_to` | `team, notes` | Escalate to a team |
1442
+ | `close_case` | `summary` | Close with an audit trail summary |
1443
+
1444
+ ---
1445
+
1446
+ ## Observation Space Reference
1447
+
1448
+ | Field | Type | Description |
1449
+ |---|---|---|
1450
+ | `task_id` | str | Which task is running |
1451
+ | `step_number` | int | Current step |
1452
+ | `case_status` | str | open / in_review / decided / routed / closed |
1453
+ | `purchase_order` | PurchaseOrder | What was agreed to be purchased |
1454
+ | `invoice` | Invoice | What the supplier is claiming |
1455
+ | `grn` | GoodsReceiptNote | What actually arrived |
1456
+ | `supplier_master` | SupplierMaster | Verified supplier record |
1457
+ | `exception_flag` | ExceptionFlag | Why this invoice was flagged |
1458
+ | `inspections` | List | Fields already inspected |
1459
+ | `checks_run` | List | Validation checks already run |
1460
+ | `queries` | List | Queries made and responses |
1461
+ | `rules_applied` | List | Business rules applied |
1462
+ | `decision` | str? | Current decision if made |
1463
+ | `routed_to` | List | Teams this case has been escalated to |
1464
+ | `available_actions` | List | All 9 action types |
1465
+ | `available_checks` | List | Check names valid for this task |
1466
+ | `available_rules` | List | Rule IDs valid for this task |
1467
+ | `knowledge_base` | List | Policy entries relevant to this task |
1468
+ | `cumulative_reward` | float | Sum of all rewards so far |
1469
+
1470
+ ---
1471
+
1472
+ ## Reward Reference
1473
+
1474
+ | Event | Reward |
1475
+ |---|---|
1476
+ | Inspecting a key field that reveals an anomaly | +0.08 to +0.14 |
1477
+ | Inspecting a routine field | +0.01 to +0.06 |
1478
+ | Cross-check that finds a mismatch | +0.12 to +0.15 |
1479
+ | Running a check that finds an issue | +0.08 to +0.18 |
1480
+ | Querying the right person | +0.04 to +0.12 |
1481
+ | Contacting supplier via wrong channel (Task 3) | −0.15 |
1482
+ | Applying the correct business rule | +0.08 to +0.12 |
1483
+ | Applying the wrong rule | −0.05 to −0.10 |
1484
+ | Correct decision (approve/reject/partial) | +0.18 to +0.28 |
1485
+ | Approving a fraudulent invoice | −0.35 to −0.40 |
1486
+ | Wrong rejection (task1) | −0.10 |
1487
+ | Routing to the right team | +0.06 to +0.14 |
1488
+ | Clean case closure | +0.06 to +0.12 |
1489
+ | Repeat action | −0.02 to −0.05 |
1490
+ | SLA breach (exceed max_steps) | −0.10 |
1491
+
1492
+ ---
1493
+
1494
+ ## Expected Baseline Scores
1495
+
1496
+ These are the scores you should see when running `inference.py` with a good LLM.
1497
+
1498
+ | Task | Difficulty | Random Agent | Rule Agent | LLM Agent (Qwen-72B) |
1499
+ |---|---|---|---|---|
1500
+ | task1_price_variance | Easy | ~0.18 | ~0.85 | ~0.80 |
1501
+ | task2_duplicate_tax | Medium | ~0.12 | ~0.72 | ~0.68 |
1502
+ | task3_compound_fraud | Hard | ~0.08 | ~0.55 | ~0.45 |
1503
+
1504
+ The hard task should be genuinely hard for LLMs — a score of 0.45 is expected, not a failure.
1505
+
1506
+ ---
1507
+
1508
+ ## Environment Variables
1509
+
1510
+ | Variable | Required | Default | Description |
1511
+ |---|---|---|---|
1512
+ | `API_BASE_URL` | Yes | `https://router.huggingface.co/v1` | LLM endpoint |
1513
+ | `MODEL_NAME` | Yes | `Qwen/Qwen2.5-72B-Instruct` | Model to use |
1514
+ | `HF_TOKEN` | Yes | — | API key for the LLM endpoint |
1515
+ | `ANTHROPIC_API_KEY` | No | — | Only if using Anthropic models directly |
1516
+
1517
+ ---
1518
+
1519
+ ## Setup Instructions
1520
+
1521
+ ### Local Development
1522
+
1523
+ ```bash
1524
+ # Clone the repo
1525
+ git clone https://github.com/YOUR_USERNAME/invoice-exception-handler.git
1526
+ cd invoice-exception-handler
1527
+
1528
+ # Create virtual environment
1529
+ python -m venv venv
1530
+ source venv/bin/activate # Windows: venv\Scripts\activate
1531
+
1532
+ # Install dependencies
1533
+ pip install -r requirements.txt
1534
+
1535
+ # Run the app locally
1536
+ python app.py
1537
+ # Visit http://localhost:7860
1538
+ ```
1539
+
1540
+ ### Run Inference
1541
+
1542
+ ```bash
1543
+ export API_BASE_URL="https://router.huggingface.co/v1"
1544
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
1545
+ export HF_TOKEN="your-token-here"
1546
+
1547
+ python inference.py
1548
+ ```
1549
+
1550
+ ### Docker
1551
+
1552
+ ```bash
1553
+ docker build -t invoice-exception-handler .
1554
+ docker run -p 7860:7860 \
1555
+ -e API_BASE_URL="https://router.huggingface.co/v1" \
1556
+ -e MODEL_NAME="Qwen/Qwen2.5-72B-Instruct" \
1557
+ -e HF_TOKEN="your-token-here" \
1558
+ invoice-exception-handler
1559
+ ```
1560
+
1561
+ ### HF Spaces Deployment
1562
+
1563
+ 1. Create a new Space with the Docker SDK (this repository deploys from its Dockerfile)
1564
+ 2. Push this repository to it
1565
+ 3. Add secrets in Space settings: `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN`
1566
+ 4. The Space will build and deploy automatically from the Dockerfile
1567
+
1568
+ ### Validate Submission
1569
+
1570
+ ```bash
1571
+ # Install validator
1572
+ pip install openenv-core
1573
+
1574
+ # Validate the spec
1575
+ openenv validate
1576
+
1577
+ # Run the full submission validator script
1578
+ chmod +x scripts/validate-submission.sh
1579
+ ./scripts/validate-submission.sh https://your-space.hf.space .
1580
+ ```
1581
+
1582
+ ---
1583
+
1584
+ ## Common Mistakes to Avoid
1585
+
1586
+ 1. **Don't use the wrong name for the inference script.** The validator looks for a file named exactly `inference.py` in the repository root.
1587
+
1588
+ 2. **Don't use the Anthropic SDK in inference.py.** The spec requires the OpenAI client. Use `from openai import OpenAI`.
1589
+
1590
+ 3. **Don't forget `flush=True` on print statements.** The validator reads stdout line by line. Without flush, logs may not appear.
1591
+
1592
+ 4. **Don't let the Gradio UI crash the FastAPI server.** If the UI has an error, it should fail gracefully, not bring down `/reset`.
1593
+
1594
+ 5. **Don't hardcode the model name.** Always read from `os.getenv("MODEL_NAME")`.
1595
+
1596
+ 6. **Don't put business logic in models.py.** That file is just data shapes.
1597
+
1598
+ 7. **Don't mutate documents during a step.** The documents (PO, Invoice, GRN) are fixed for the duration of an episode. Only EpisodeData changes.
1599
+
1600
+ 8. **Don't forget to test determinism.** Same seed + same actions must = same score. Run the determinism test.
1601
+
1602
+ 9. **Don't skip the docker build test.** The validator builds your Docker image. If it doesn't build, you're disqualified.
1603
+
1604
+ 10. **Don't forget the changelog.** Update `documents/CHANGELOG.md` before every push.
1605
+
1606
+ ---
1607
+
1608
+ ## License
1609
+
1610
+ MIT License. See LICENSE file.
env/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Invoice Exception Handler — OpenEnv environment package.
3
+
4
+ Import the main environment class and supporting types from here:
5
+ from env import InvoiceExceptionEnv, Action, ALL_TASKS
6
+ """
7
+ from .environment import InvoiceExceptionEnv
8
+ from .models import Action, ActionType, EnvironmentState, StepResult
9
+ from .tasks import ALL_TASKS, make_task
10
+
11
+ __all__ = [
12
+ "InvoiceExceptionEnv",
13
+ "Action",
14
+ "ActionType",
15
+ "EnvironmentState",
16
+ "StepResult",
17
+ "ALL_TASKS",
18
+ "make_task",
19
+ ]
env/environment.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ InvoiceExceptionEnv — the main environment class.
3
+
4
+ This is the only class external code needs to import. It wraps the task
5
+ registry, dispatches actions, manages episode state, and provides the
6
+ OpenEnv-compatible API: reset(), step(), state(), grade().
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import random
11
+ from typing import Any, Dict, List, Optional, Union
12
+
13
+ from .models import (
14
+ Action, ActionType, CaseStatus, EnvironmentState, StepResult,
15
+ )
16
+ from .tasks import ALL_TASKS, BaseTask, EpisodeData, make_task
17
+
18
+
19
+ class InvoiceExceptionEnv:
20
+ """
21
+ OpenEnv-compatible Invoice Exception Handler environment.
22
+
23
+ Usage:
24
+ env = InvoiceExceptionEnv(seed=42)
25
+ obs = env.reset("task1_price_variance")
26
+ result = env.step(Action.run_check("tolerance_rule"))
27
+ scores = env.grade()
28
+ """
29
+
30
    def __init__(self, seed: Optional[int] = None) -> None:
        """Initialise with an optional seed for reproducibility."""
        # Single RNG drives every random choice (task pick, action sampling).
        self._rng = random.Random(seed)
        # Active task definition; set by reset(), None before the first episode.
        self._task: Optional[BaseTask] = None
        # Mutable per-episode state (inspections, checks, decision, ...).
        self._ep: Optional[EpisodeData] = None
        # Last observation built by _build_state(); served by state().
        self._state_cache: Optional[EnvironmentState] = None
        # True once the episode ended (case closed or SLA breached).
        self._done: bool = False
37
+
38
+ # ------------------------------------------------------------------
39
+ # Public API
40
+ # ------------------------------------------------------------------
41
+
42
+ def reset(self, task_id: Optional[str] = None) -> EnvironmentState:
43
+ """
44
+ Start a new episode. If task_id is None, picks one at random.
45
+ Returns the initial EnvironmentState showing all documents and
46
+ available actions.
47
+ """
48
+ if task_id is None:
49
+ task_id = self._rng.choice(ALL_TASKS)
50
+
51
+ self._task = make_task(task_id)
52
+ self._ep = EpisodeData()
53
+ self._done = False
54
+ self._state_cache = self._build_state()
55
+ return self._state_cache
56
+
57
+ def step(self, action: Union[Action, Dict[str, Any]]) -> StepResult:
58
+ """
59
+ Execute one action. Returns observation, reward, done flag, and
60
+ info dict. Raises RuntimeError if called before reset() or after
61
+ the episode is done.
62
+ """
63
+ if self._task is None or self._ep is None:
64
+ raise RuntimeError("Call reset() before step().")
65
+ if self._done:
66
+ raise RuntimeError("Episode is done. Call reset() to start a new one.")
67
+
68
+ # Convert dict to Action if needed
69
+ if isinstance(action, dict):
70
+ action = Action(
71
+ type=ActionType(action.get("type", action.get("action_type", ""))),
72
+ params=action.get("params", {}),
73
+ )
74
+
75
+ # Dispatch the action
76
+ reward, info = self._dispatch(action)
77
+
78
+ # Update episode
79
+ self._ep.step_count += 1
80
+ self._ep.cumulative_reward += reward
81
+
82
+ # Check SLA breach
83
+ sla_penalty = 0.0
84
+ if self._ep.step_count >= self._task.max_steps:
85
+ sla_penalty = -0.10
86
+ self._done = True
87
+ info["sla_breach"] = True
88
+
89
+ # Check done conditions
90
+ if self._ep.closed:
91
+ self._done = True
92
+
93
+ total_reward = reward + sla_penalty
94
+ self._ep.cumulative_reward += sla_penalty # add SLA penalty separately
95
+
96
+ # Rebuild state
97
+ self._state_cache = self._build_state()
98
+
99
+ return StepResult(
100
+ observation=self._state_cache,
101
+ reward=round(total_reward, 4),
102
+ done=self._done,
103
+ info=info,
104
+ )
105
+
106
+ def state(self) -> EnvironmentState:
107
+ """Return the current state without advancing the episode."""
108
+ if self._state_cache is None:
109
+ raise RuntimeError("Call reset() before state().")
110
+ return self._state_cache
111
+
112
+ def grade(self) -> Dict[str, float]:
113
+ """Run the task grader on the current episode and return scores."""
114
+ if self._task is None or self._ep is None:
115
+ raise RuntimeError("Call reset() before grade().")
116
+ return self._task.grade(self._ep)
117
+
118
    def action_space_sample(self) -> Action:
        """Return a random valid action for baseline/testing purposes."""
        if self._task is None:
            raise RuntimeError("Call reset() before action_space_sample().")

        # NOTE: the sequence of self._rng calls below determines the random
        # baseline's behaviour — keep call order stable for reproducibility.
        action_type = self._rng.choice(list(ActionType))

        if action_type == ActionType.INSPECT_FIELD:
            doc = self._rng.choice(["invoice", "po", "grn", "supplier_master"])
            field = self._rng.choice(["line_items", "total_amount", "bank_account",
                                      "supplier_gstin", "items_received"])
            return Action.inspect_field(doc, field)

        elif action_type == ActionType.CROSS_CHECK:
            field = self._rng.choice(["unit_price", "total_amount", "bank_account",
                                      "gstin", "quantity"])
            doc_a = self._rng.choice(["invoice", "po"])
            doc_b = self._rng.choice(["po", "grn", "supplier_master"])
            return Action.cross_check(field, doc_a, doc_b)

        elif action_type == ActionType.RUN_CHECK:
            # Checks/rules are sampled from the task's own valid lists.
            check = self._rng.choice(self._task.available_checks)
            return Action.run_check(check)

        elif action_type == ActionType.QUERY_SUPPLIER:
            channel = self._rng.choice(["email", "phone"])
            return Action.query_supplier("What is the status?", channel)

        elif action_type == ActionType.QUERY_INTERNAL:
            dept = self._rng.choice(["procurement", "finance", "legal", "security"])
            return Action.query_internal(dept, "Can you provide information?")

        elif action_type == ActionType.APPLY_RULE:
            rule = self._rng.choice(self._task.available_rules)
            return Action.apply_rule(rule)

        elif action_type == ActionType.MAKE_DECISION:
            decision = self._rng.choice(["approve", "reject", "hold", "partial_approve"])
            return Action.make_decision(decision, "Random baseline decision.")

        elif action_type == ActionType.ROUTE_TO:
            team = self._rng.choice(["procurement", "finance", "legal", "security"])
            return Action.route_to(team, "Random baseline routing.")

        elif action_type == ActionType.CLOSE_CASE:
            return Action.close_case("Random baseline closure.")

        # Fallback (unreachable while the branches above cover all ActionTypes)
        return Action.run_check(self._task.available_checks[0])
167
+
168
+ # ------------------------------------------------------------------
169
+ # Internal methods
170
+ # ------------------------------------------------------------------
171
+
172
    def _dispatch(self, action: Action) -> tuple[float, Dict[str, Any]]:
        """
        Route an action to the appropriate task simulator.
        Returns (reward, info dict). Handles repeat-action penalties.
        """
        params = action.params
        info: Dict[str, Any] = {"action_type": action.type.value}

        if action.type == ActionType.INSPECT_FIELD:
            doc = params.get("document", "")
            field = params.get("field", "")

            # Repeat penalty: re-inspecting the same field earns -0.02.
            if self._ep.has_inspected(doc, field):
                info["repeat"] = True
                return -0.02, info

            result, reward = self._task.simulate_inspect(doc, field)
            self._ep.inspections.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.CROSS_CHECK:
            field = params.get("field", "")
            doc_a = params.get("doc_a", "")
            doc_b = params.get("doc_b", "")

            # Cross-checks are deduplicated by a synthetic key; repeats cost -0.03.
            check_key = f"cross_{field}_{doc_a}_{doc_b}"
            if self._ep.has_checked(check_key):
                info["repeat"] = True
                return -0.03, info

            result, reward = self._task.simulate_cross_check(field, doc_a, doc_b)
            self._ep.checks.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.RUN_CHECK:
            check_name = params.get("check_name", "")

            # Re-running a named check earns -0.03.
            if self._ep.has_checked(check_name):
                info["repeat"] = True
                return -0.03, info

            result, reward = self._task.simulate_run_check(check_name)
            self._ep.checks.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.QUERY_SUPPLIER:
            question = params.get("question", "")
            channel = params.get("channel", "email")

            # Only one supplier query per episode; repeats cost -0.05.
            if self._ep.has_queried("supplier"):
                info["repeat"] = True
                return -0.05, info

            result, reward = self._task.simulate_query_supplier(question, channel)
            self._ep.queries.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.QUERY_INTERNAL:
            department = params.get("department", "")
            question = params.get("question", "")

            # One query per internal department; repeats cost -0.03.
            if self._ep.has_queried(department.lower()):
                info["repeat"] = True
                return -0.03, info

            result, reward = self._task.simulate_query_internal(department, question)
            self._ep.queries.append(result)
            info["result"] = result.model_dump()
            return reward, info

        elif action.type == ActionType.APPLY_RULE:
            rule_id = params.get("rule_id", "")

            # Each rule may be applied once; repeats cost -0.03.
            if rule_id in self._ep.rules_applied:
                info["repeat"] = True
                return -0.03, info

            detail, reward = self._task.simulate_apply_rule(rule_id)
            self._ep.rules_applied.append(rule_id)
            info["detail"] = detail
            return reward, info

        elif action.type == ActionType.MAKE_DECISION:
            decision = params.get("decision", "")
            reason = params.get("reason", "")

            # A decision is final; attempting a second one costs -0.05.
            if self._ep.decision is not None:
                info["repeat"] = True
                return -0.05, info

            reward = self._task.simulate_make_decision(decision, reason, self._ep)
            self._ep.decision = decision
            self._ep.decision_reason = reason
            info["decision"] = decision
            return reward, info

        elif action.type == ActionType.ROUTE_TO:
            team = params.get("team", "")
            notes = params.get("notes", "")

            # Routing twice to the same team (case-insensitive) costs -0.02.
            if team.lower() in self._ep.routed_to:
                info["repeat"] = True
                return -0.02, info

            reward = self._task.simulate_route_to(team, notes, self._ep)
            self._ep.routed_to.append(team.lower())
            info["routed_to"] = team
            return reward, info

        elif action.type == ActionType.CLOSE_CASE:
            summary = params.get("summary", "")

            # Closing an already-closed case costs -0.05.
            if self._ep.closed:
                info["repeat"] = True
                return -0.05, info

            reward = self._task.simulate_close(summary, self._ep)
            self._ep.closed = True
            self._ep.close_summary = summary
            info["closed"] = True
            return reward, info

        # Unknown action type
        return 0.0, {"error": f"Unknown action type: {action.type}"}
301
+
302
    def _build_state(self) -> EnvironmentState:
        """Construct an EnvironmentState from current task and episode data."""
        # Determine case status; precedence: closed > routed > decided >
        # in-review (any step taken) > open.
        if self._ep.closed:
            status = CaseStatus.CLOSED
        elif self._ep.routed_to:
            status = CaseStatus.ROUTED
        elif self._ep.decision is not None:
            status = CaseStatus.DECIDED
        elif self._ep.step_count > 0:
            status = CaseStatus.IN_REVIEW
        else:
            status = CaseStatus.OPEN

        # list(...) copies shield the episode's internal lists from callers.
        return EnvironmentState(
            task_id=self._task.task_id,
            step_number=self._ep.step_count,
            case_status=status,
            purchase_order=self._task.get_purchase_order(),
            invoice=self._task.get_invoice(),
            grn=self._task.get_grn(),
            supplier_master=self._task.get_supplier_master(),
            exception_flag=self._task.get_exception_flag(),
            inspections=list(self._ep.inspections),
            checks_run=list(self._ep.checks),
            queries=list(self._ep.queries),
            rules_applied=list(self._ep.rules_applied),
            decision=self._ep.decision,
            decision_reason=self._ep.decision_reason,
            routed_to=list(self._ep.routed_to),
            case_closed=self._ep.closed,
            close_summary=self._ep.close_summary,
            available_actions=[at.value for at in ActionType],
            available_checks=self._task.available_checks,
            available_rules=self._task.available_rules,
            knowledge_base=self._task.knowledge_base,
            cumulative_reward=round(self._ep.cumulative_reward, 4),
        )
env/models.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Typed models for the Invoice Exception Handler OpenEnv environment.
3
+
4
+ Every object the agent sees or produces is defined here as a Pydantic model.
5
+ This is the single source of truth for the data contract between the
6
+ environment simulation and the agent.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from enum import Enum
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from pydantic import BaseModel, Field
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Enumerations
19
+ # ---------------------------------------------------------------------------
20
+
21
class ActionType(str, Enum):
    """The nine action types an agent can take during an episode.

    NOTE: member order is significant — the environment samples random
    actions from list(ActionType), so reordering changes seeded runs.
    """
    INSPECT_FIELD = "inspect_field"
    CROSS_CHECK = "cross_check"
    RUN_CHECK = "run_check"
    QUERY_SUPPLIER = "query_supplier"
    QUERY_INTERNAL = "query_internal"
    APPLY_RULE = "apply_rule"
    MAKE_DECISION = "make_decision"
    ROUTE_TO = "route_to"
    CLOSE_CASE = "close_case"
+
33
+
34
class DecisionType(str, Enum):
    """Possible decisions the agent can make on a flagged invoice."""
    APPROVE = "approve"
    REJECT = "reject"
    HOLD = "hold"
    PARTIAL_APPROVE = "partial_approve"
40
+
41
+
42
class CaseStatus(str, Enum):
    """Lifecycle status of an invoice exception case.

    Progression: open -> in_review (after the first step) -> decided /
    routed -> closed.
    """
    OPEN = "open"
    IN_REVIEW = "in_review"
    DECIDED = "decided"
    ROUTED = "routed"
    CLOSED = "closed"
49
+
50
+
51
+ # ---------------------------------------------------------------------------
52
+ # Document models — read-only context given to the agent
53
+ # ---------------------------------------------------------------------------
54
+
55
class LineItem(BaseModel):
    """One line on an invoice or purchase order."""
    description: str = Field(..., description="Item description")
    quantity: int = Field(..., description="Number of units")
    unit_price: float = Field(..., description="Price per unit in INR")
    total: float = Field(..., description="Line total in INR (quantity × unit_price)")
    # Optional because PO lines may omit tax; invoice lines typically carry it.
    tax_rate: Optional[float] = Field(None, description="Tax rate as a percentage, if applicable")
62
+
63
+
64
class PurchaseOrder(BaseModel):
    """What was agreed to be purchased.

    Read-only context for the agent; immutable for the episode's duration.
    """
    po_number: str = Field(..., description="Unique PO identifier")
    vendor_name: str = Field(..., description="Supplier name on the PO")
    po_date: str = Field(..., description="Date the PO was raised (YYYY-MM-DD)")
    line_items: List[LineItem] = Field(default_factory=list, description="Items on the PO")
    total_amount: float = Field(..., description="Total PO value in INR")
    payment_terms: str = Field("Net-30", description="Payment terms")
    currency: str = Field("INR", description="Currency code")
73
+
74
+
75
class Invoice(BaseModel):
    """What the supplier is claiming — the document under exception review.

    ``po_reference`` links this invoice to a ``PurchaseOrder``; bank and
    GST fields are meant to be verified against ``SupplierMaster``.
    """
    invoice_number: str = Field(..., description="Unique invoice identifier")
    supplier_name: str = Field(..., description="Supplier name on the invoice")
    invoice_date: str = Field(..., description="Date of the invoice (YYYY-MM-DD)")
    due_date: str = Field(..., description="Payment due date (YYYY-MM-DD)")
    po_reference: str = Field(..., description="PO number referenced by this invoice")
    line_items: List[LineItem] = Field(default_factory=list, description="Items invoiced")
    # Monetary breakdown: total_amount = subtotal + tax_amount.
    subtotal: float = Field(..., description="Pre-tax total in INR")
    tax_amount: float = Field(..., description="Total tax amount in INR")
    tax_rate: float = Field(..., description="Applied tax rate as a percentage")
    total_amount: float = Field(..., description="Grand total including tax in INR")
    # Payment details as printed on the invoice (to be cross-checked).
    bank_account: str = Field(..., description="Supplier bank account on the invoice")
    bank_name: str = Field("", description="Bank name")
    ifsc_code: str = Field("", description="IFSC / routing code")
    supplier_gstin: str = Field("", description="GST Identification Number on the invoice")
    supplier_email: str = Field("", description="Email address on the invoice")
    currency: str = Field("INR", description="Currency code")
93
+
94
+
95
class GoodsReceiptNote(BaseModel):
    """What actually arrived at the warehouse (or service confirmation).

    Linked to a ``PurchaseOrder`` via ``po_reference``.
    """
    grn_number: str = Field(..., description="Unique GRN identifier")
    po_reference: str = Field(..., description="PO number this receipt is against")
    receipt_date: str = Field(..., description="Date goods/services were received (YYYY-MM-DD)")
    # Kept as loose dicts (not LineItem) because receipt rows carry extra
    # quantity_* keys that invoices/POs do not.
    items_received: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="List of received item dicts with description, quantity_received, quantity_pending, quantity_rejected"
    )
    receiving_officer: str = Field("", description="Person who signed the receipt")
    notes: str = Field("", description="Any delivery notes or discrepancies observed")
106
+
107
+
108
class SupplierMaster(BaseModel):
    """The verified, registered supplier record in the company's ERP system.

    Source of truth for bank/GST/contact data; invoice fields are
    validated against this record.
    """
    supplier_id: str = Field(..., description="Internal supplier code")
    supplier_name: str = Field(..., description="Registered legal name")
    registered_address: str = Field("", description="Registered business address")
    gstin: str = Field(..., description="Verified GST Identification Number")
    bank_account: str = Field(..., description="Verified bank account number")
    bank_name: str = Field("", description="Bank name")
    ifsc_code: str = Field("", description="Verified IFSC / routing code")
    contact_email: str = Field("", description="Registered email address")
    contact_phone: str = Field("", description="Registered phone number")
    registered_domain: str = Field("", description="Verified email domain for the supplier")
    pan_number: str = Field("", description="PAN (tax ID)")
    # Free-form string, not an Enum; expected values listed in the description.
    status: str = Field("active", description="Supplier status: active, suspended, blacklisted")
122
+
123
+
124
class ExceptionFlag(BaseModel):
    """Why the AP system flagged this invoice for manual review."""
    flag_code: str = Field(..., description="Machine-readable code, e.g. PRICE_MISMATCH")
    flag_description: str = Field(..., description="Human-readable explanation of the flag")
    auto_hold: bool = Field(False, description="Whether the system placed an automatic payment hold")
    flagged_date: str = Field("", description="Date the flag was raised (YYYY-MM-DD)")
    # Free-form string, not an Enum; expected values listed in the description.
    severity: str = Field("medium", description="low / medium / high / critical")
131
+
132
+
133
+ # ---------------------------------------------------------------------------
134
+ # Action model
135
+ # ---------------------------------------------------------------------------
136
+
137
class Action(BaseModel):
    """
    A single action the agent asks the environment to perform.

    Rather than filling ``type``/``params`` by hand, prefer the
    convenience constructors, e.g.::

        Action.run_check("tolerance_rule")
        Action.make_decision("approve", "reason here")
    """
    type: ActionType = Field(..., description="Which action type to execute")
    params: Dict[str, Any] = Field(default_factory=dict, description="Parameters for the action")

    # --- Convenience constructors, one per ActionType ---

    @classmethod
    def inspect_field(cls, document: str, field: str) -> Action:
        """Build an action that looks at one field of one document."""
        payload = {"document": document, "field": field}
        return cls(type=ActionType.INSPECT_FIELD, params=payload)

    @classmethod
    def cross_check(cls, field: str, doc_a: str, doc_b: str) -> Action:
        """Build an action that compares a field across two documents."""
        payload = {"field": field, "doc_a": doc_a, "doc_b": doc_b}
        return cls(type=ActionType.CROSS_CHECK, params=payload)

    @classmethod
    def run_check(cls, check_name: str) -> Action:
        """Build an action that runs a named validation check."""
        payload = {"check_name": check_name}
        return cls(type=ActionType.RUN_CHECK, params=payload)

    @classmethod
    def query_supplier(cls, question: str, channel: str = "email") -> Action:
        """Build an action that asks the supplier a question on a channel."""
        payload = {"question": question, "channel": channel}
        return cls(type=ActionType.QUERY_SUPPLIER, params=payload)

    @classmethod
    def query_internal(cls, department: str, question: str) -> Action:
        """Build an action that asks an internal department a question."""
        payload = {"department": department, "question": question}
        return cls(type=ActionType.QUERY_INTERNAL, params=payload)

    @classmethod
    def apply_rule(cls, rule_id: str) -> Action:
        """Build an action that applies a named business policy rule."""
        payload = {"rule_id": rule_id}
        return cls(type=ActionType.APPLY_RULE, params=payload)

    @classmethod
    def make_decision(cls, decision: str, reason: str) -> Action:
        """Build an action that records a case decision with its reason."""
        payload = {"decision": decision, "reason": reason}
        return cls(type=ActionType.MAKE_DECISION, params=payload)

    @classmethod
    def route_to(cls, team: str, notes: str = "") -> Action:
        """Build an action that escalates the case to a specific team."""
        payload = {"team": team, "notes": notes}
        return cls(type=ActionType.ROUTE_TO, params=payload)

    @classmethod
    def close_case(cls, summary: str) -> Action:
        """Build an action that closes the case with an audit summary."""
        payload = {"summary": summary}
        return cls(type=ActionType.CLOSE_CASE, params=payload)
194
+
195
+
196
+ # ---------------------------------------------------------------------------
197
+ # Result models — returned by simulators
198
+ # ---------------------------------------------------------------------------
199
+
200
class InspectionResult(BaseModel):
    """What came back from inspecting a specific field in a document.

    Produced by the ``inspect_field`` action simulators.
    """
    document: str = Field(..., description="Which document was inspected")
    field: str = Field(..., description="Which field was inspected")
    value: Any = Field(..., description="The value found in that field")
    note: str = Field("", description="Any contextual note about the value")
    # Wall-clock time of creation, defaulted at construction.
    timestamp: float = Field(default_factory=time.time, description="When the inspection happened")
207
+
208
+
209
class CheckResult(BaseModel):
    """What came back from running a validation check or cross-check."""
    check_name: str = Field(..., description="Name of the check that was run")
    passed: bool = Field(..., description="Whether the check passed (True) or failed (False)")
    detail: str = Field("", description="Human-readable detail of what was found")
    # Wall-clock time of creation, defaulted at construction.
    timestamp: float = Field(default_factory=time.time, description="When the check was run")
215
+
216
+
217
class QueryResult(BaseModel):
    """What came back from querying a supplier or internal department."""
    target: str = Field(..., description="Who was queried (supplier, procurement, finance, etc.)")
    question: str = Field("", description="The question that was asked")
    response: str = Field(..., description="The response received")
    channel: str = Field("email", description="Communication channel used (email, phone, etc.)")
    # Wall-clock time of creation, defaulted at construction.
    timestamp: float = Field(default_factory=time.time, description="When the query was made")
224
+
225
+
226
+ # ---------------------------------------------------------------------------
227
+ # State models
228
+ # ---------------------------------------------------------------------------
229
+
230
class EnvironmentState(BaseModel):
    """
    The full observable state returned by reset() and step().

    This is what the agent sees at every turn — all documents, all history,
    and all available actions/checks/rules for the current task.
    """
    task_id: str = Field(..., description="Which task is currently running")
    step_number: int = Field(0, description="Current step number in the episode")
    case_status: CaseStatus = Field(CaseStatus.OPEN, description="Current lifecycle status")

    # The five documents (read-only scenario context)
    purchase_order: PurchaseOrder = Field(..., description="The purchase order")
    invoice: Invoice = Field(..., description="The invoice under review")
    grn: GoodsReceiptNote = Field(..., description="The goods receipt note")
    supplier_master: SupplierMaster = Field(..., description="The verified supplier record")
    exception_flag: ExceptionFlag = Field(..., description="Why this invoice was flagged")

    # Agent history — what has been done so far
    inspections: List[InspectionResult] = Field(default_factory=list, description="Fields inspected")
    checks_run: List[CheckResult] = Field(default_factory=list, description="Checks completed")
    queries: List[QueryResult] = Field(default_factory=list, description="Queries made")
    rules_applied: List[str] = Field(default_factory=list, description="Rules applied")

    # Decision state
    decision: Optional[str] = Field(None, description="Current decision if one has been made")
    decision_reason: Optional[str] = Field(None, description="Reason for the decision")
    routed_to: List[str] = Field(default_factory=list, description="Teams case has been routed to")
    case_closed: bool = Field(False, description="Whether the case has been closed")
    close_summary: Optional[str] = Field(None, description="Closure summary if case is closed")

    # Action hints — what the agent can do in the current task
    available_actions: List[str] = Field(default_factory=list, description="All valid action types")
    available_checks: List[str] = Field(default_factory=list, description="Check names for this task")
    available_rules: List[str] = Field(default_factory=list, description="Rule IDs for this task")
    knowledge_base: List[str] = Field(default_factory=list, description="Policy entries for this task")

    # Running totals
    cumulative_reward: float = Field(0.0, description="Sum of all rewards received so far")
269
+
270
+
271
class StepResult(BaseModel):
    """What step() returns — the observation, reward, done flag, and info dict.

    Mirrors the classic Gym-style (obs, reward, done, info) tuple as a model.
    """
    observation: EnvironmentState = Field(..., description="Updated environment state after the action")
    reward: float = Field(..., description="Reward for this specific action")
    done: bool = Field(False, description="Whether the episode is over")
    info: Dict[str, Any] = Field(default_factory=dict, description="Extra info about the step")
env/tasks.py ADDED
@@ -0,0 +1,984 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Task definitions for the Invoice Exception Handler environment.
3
+
4
+ Each task defines a scenario with documents, simulator logic for every action
5
+ type, and a grader that produces sub-scores in [0.0, 1.0]. This is the biggest
6
+ file in the project — it contains all the business logic the environment needs.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+ from .models import (
14
+ ActionType, CheckResult, ExceptionFlag, GoodsReceiptNote,
15
+ InspectionResult, Invoice, LineItem, PurchaseOrder, QueryResult,
16
+ SupplierMaster,
17
+ )
18
+
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # EpisodeData — mutable state for one episode
22
+ # ---------------------------------------------------------------------------
23
+
24
class EpisodeData:
    """Mutable per-episode history, used for state building and grading."""

    def __init__(self) -> None:
        # Evidence gathered by the agent so far.
        self.inspections: List[InspectionResult] = []
        self.checks: List[CheckResult] = []
        self.queries: List[QueryResult] = []
        self.rules_applied: List[str] = []
        # Case outcome, filled in as the agent acts.
        self.decision: Optional[str] = None
        self.decision_reason: Optional[str] = None
        self.routed_to: List[str] = []
        self.closed: bool = False
        self.close_summary: Optional[str] = None
        # Episode bookkeeping.
        self.step_count: int = 0
        self.cumulative_reward: float = 0.0

    def has_inspected(self, doc: str, field: str) -> bool:
        """Return True if this (document, field) pair was already inspected."""
        for item in self.inspections:
            if item.document == doc and item.field == field:
                return True
        return False

    def has_checked(self, name: str) -> bool:
        """Return True if the named validation check was already run."""
        return name in {c.check_name for c in self.checks}

    def has_queried(self, target: str) -> bool:
        """Return True if this person or department was already queried."""
        return target in {q.target for q in self.queries}
51
+
52
+
53
+ # ---------------------------------------------------------------------------
54
+ # BaseTask — abstract interface
55
+ # ---------------------------------------------------------------------------
56
+
57
class BaseTask:
    """Abstract base that all task classes inherit from.

    A concrete task supplies the five scenario documents, one simulator
    per action type (each returning a ``(result, reward)`` pair, except
    the decision/route/close simulators which return just a reward), and
    a final grader producing sub-scores.
    """

    # Identification and episode limits; overridden by concrete tasks.
    task_id: str = "base"
    max_steps: int = 20
    difficulty: str = "easy"

    # --- Scenario documents (read-only context shown to the agent) ---

    def get_purchase_order(self) -> PurchaseOrder:
        raise NotImplementedError

    def get_invoice(self) -> Invoice:
        raise NotImplementedError

    def get_grn(self) -> GoodsReceiptNote:
        raise NotImplementedError

    def get_supplier_master(self) -> SupplierMaster:
        raise NotImplementedError

    def get_exception_flag(self) -> ExceptionFlag:
        raise NotImplementedError

    # --- Action simulators, one per ActionType ---

    def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]:
        raise NotImplementedError

    def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]:
        raise NotImplementedError

    def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]:
        raise NotImplementedError

    def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]:
        raise NotImplementedError

    def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]:
        raise NotImplementedError

    def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]:
        raise NotImplementedError

    def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float:
        raise NotImplementedError

    def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float:
        raise NotImplementedError

    def simulate_close(self, summary: str, ep: EpisodeData) -> float:
        raise NotImplementedError

    # --- Grading ---

    def grade(self, ep: EpisodeData) -> Dict[str, float]:
        raise NotImplementedError

    # --- Task metadata exposed to the agent; default to empty lists ---

    @property
    def available_checks(self) -> List[str]:
        return []

    @property
    def available_rules(self) -> List[str]:
        return []

    @property
    def knowledge_base(self) -> List[str]:
        return []
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Task 1 — Price Variance Exception (Easy)
124
+ # ---------------------------------------------------------------------------
125
+
126
class PriceVarianceTask(BaseTask):
    """
    Office stationery invoice arrives 3.08% above the PO.
    Company tolerance is +/-2% auto-approval. Supplier had verbal approval
    from procurement for the price increase but the PO was never updated.

    Optimal path: check tolerance -> cross-check prices -> verify GRN ->
    query supplier -> query procurement -> apply exception rule -> approve ->
    route to procurement for PO amendment -> close.
    """

    task_id = "task1_price_variance"
    max_steps = 18
    difficulty = "easy"

    # --- Scenario documents ---

    def get_purchase_order(self) -> PurchaseOrder:
        """The agreed order: three stationery lines totalling ₹50,000."""
        return PurchaseOrder(
            po_number="PO-2024-1041",
            vendor_name="OfficeNeed Supplies",
            po_date="2024-02-15",
            line_items=[
                LineItem(description="A4 Paper", quantity=100, unit_price=220.0, total=22000.0, tax_rate=18.0),
                LineItem(description="Ballpoint Pens", quantity=20, unit_price=450.0, total=9000.0, tax_rate=18.0),
                LineItem(description="Staplers", quantity=10, unit_price=1900.0, total=19000.0, tax_rate=18.0),
            ],
            total_amount=50000.0,
            payment_terms="Net-30",
        )

    def get_invoice(self) -> Invoice:
        """The flagged invoice: A4 Paper and Pens priced above PO (subtotal ₹51,540)."""
        return Invoice(
            invoice_number="INV-ON-8821",
            supplier_name="OfficeNeed Supplies",
            invoice_date="2024-03-05",
            due_date="2024-04-04",
            po_reference="PO-2024-1041",
            line_items=[
                LineItem(description="A4 Paper", quantity=100, unit_price=231.0, total=23100.0, tax_rate=18.0),
                LineItem(description="Ballpoint Pens", quantity=20, unit_price=472.0, total=9440.0, tax_rate=18.0),
                LineItem(description="Staplers", quantity=10, unit_price=1900.0, total=19000.0, tax_rate=18.0),
            ],
            subtotal=51540.0,
            tax_amount=9277.20,
            tax_rate=18.0,
            total_amount=60817.20,
            bank_account="9876543210",
            bank_name="HDFC Bank",
            ifsc_code="HDFC0001234",
            supplier_gstin="29AABCO1234F1Z5",
            supplier_email="accounts@officeneed.com",
        )

    def get_grn(self) -> GoodsReceiptNote:
        """Receipt confirming all three items arrived in full, nothing rejected."""
        return GoodsReceiptNote(
            grn_number="GRN-2024-0892",
            po_reference="PO-2024-1041",
            receipt_date="2024-03-01",
            items_received=[
                {"description": "A4 Paper", "quantity_received": 100, "quantity_pending": 0, "quantity_rejected": 0},
                {"description": "Ballpoint Pens", "quantity_received": 20, "quantity_pending": 0, "quantity_rejected": 0},
                {"description": "Staplers", "quantity_received": 10, "quantity_pending": 0, "quantity_rejected": 0},
            ],
            receiving_officer="Ramesh Kumar",
            notes="All items received in good condition.",
        )

    def get_supplier_master(self) -> SupplierMaster:
        """ERP record whose bank account and GSTIN match the invoice exactly."""
        return SupplierMaster(
            supplier_id="SUP-0441",
            supplier_name="OfficeNeed Supplies",
            registered_address="45 MG Road, Bengaluru 560001",
            gstin="29AABCO1234F1Z5",
            bank_account="9876543210",
            bank_name="HDFC Bank",
            ifsc_code="HDFC0001234",
            contact_email="sales@officeneed.com",
            contact_phone="+91-80-4567-8901",
            registered_domain="officeneed.com",
            pan_number="AABCO1234F",
            status="active",
        )

    def get_exception_flag(self) -> ExceptionFlag:
        """PRICE_MISMATCH flag with an automatic payment hold."""
        return ExceptionFlag(
            flag_code="PRICE_MISMATCH",
            flag_description=(
                "Invoice total ₹51,540 exceeds PO ₹50,000 by ₹1,540 (3.08%). "
                "Above auto-approval threshold."
            ),
            auto_hold=True,
            flagged_date="2024-03-06",
            severity="medium",
        )

    @property
    def available_checks(self) -> List[str]:
        return ["tolerance_rule", "grn_match", "duplicate_detection",
                "bank_account_verification", "gst_verification", "po_match"]

    @property
    def available_rules(self) -> List[str]:
        return ["tolerance_2pct_auto_approve", "tolerance_exception_approval",
                "rejection_with_reason", "partial_approval"]

    @property
    def knowledge_base(self) -> List[str]:
        return [
            "POL-001: Price variance ≤±2% may be auto-approved. Above 2% requires exception approval.",
            "POL-002: Exception approval requires confirmation from originating department.",
            "POL-003: Any approved invoice with a price change must be followed by a PO amendment request.",
            "POL-004: Bank account on invoice must match supplier master.",
        ]

    # --- Simulators ---

    def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]:
        """Return meaningful values for key fields, small reward for others."""
        # Maps (document, field) -> (value shown to the agent, reward).
        # Fields relevant to diagnosing the price variance pay more.
        key_fields = {
            ("invoice", "line_items"): ("A4 Paper @₹231 (+5%), Pens @₹472 (+4.9%), Staplers @₹1900 (unchanged)", 0.10),
            ("invoice", "total_amount"): ("₹51,540 (subtotal) + ₹9,277.20 (GST 18%) = ₹60,817.20", 0.08),
            ("po", "line_items"): ("A4 Paper @₹220, Pens @₹450, Staplers @₹1900. Total: ₹50,000", 0.06),
            ("grn", "items_received"): ("All 3 items fully received. No pending, no rejected.", 0.05),
            ("invoice", "bank_account"): ("9876543210 — HDFC Bank, IFSC HDFC0001234", 0.02),
            ("invoice", "supplier_gstin"): ("29AABCO1234F1Z5", 0.02),
        }
        # Lookup is case-insensitive; unknown fields get a tiny reward.
        key = (document.lower(), field.lower())
        value, reward = key_fields.get(key, (f"{document}.{field} — no anomaly detected", 0.01))
        result = InspectionResult(document=document, field=field, value=value, note="")
        return result, reward

    def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]:
        """Cross-check a field between two documents."""
        # Maps (field, doc_a, doc_b) -> (passed, detail, reward).
        # NOTE: lookup is order-sensitive in doc_a/doc_b — only the listed
        # orderings hit the detailed results; others fall through to default.
        checks = {
            ("unit_price", "invoice", "po"): (False, "MISMATCH: A4 Paper ₹231 vs ₹220 (+5.0%), Pens ₹472 vs ₹450 (+4.9%). Staplers match.", 0.12),
            ("total_amount", "invoice", "po"): (False, "Invoice subtotal ₹51,540 vs PO ₹50,000. Variance: +₹1,540 (+3.08%).", 0.10),
            ("bank_account", "invoice", "supplier_master"): (True, "Bank account 9876543210 matches supplier master.", 0.03),
            ("gstin", "invoice", "supplier_master"): (True, "GSTIN 29AABCO1234F1Z5 matches supplier master.", 0.02),
            ("quantity", "invoice", "grn"): (True, "All quantities match: 100 reams, 20 boxes, 10 units.", 0.04),
        }
        key = (field.lower(), doc_a.lower(), doc_b.lower())
        passed, detail, reward = checks.get(key, (True, f"No mismatch found for {field} between {doc_a} and {doc_b}.", 0.01))
        result = CheckResult(check_name=f"cross_{field}_{doc_a}_{doc_b}", passed=passed, detail=detail)
        return result, reward

    def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]:
        """Run a named validation check."""
        # Maps check_name -> (passed, detail, reward). tolerance_rule is the
        # pivotal failing check that unlocks the exception-approval path.
        checks = {
            "tolerance_rule": (False, "Price variance 3.08% exceeds ±2% auto-approval threshold. Manual exception approval required.", 0.14),
            "grn_match": (True, "All items fully received. GRN matches invoice quantities.", 0.06),
            "duplicate_detection": (True, "No duplicate invoice found in payment history.", 0.02),
            "bank_account_verification": (True, "Bank account matches supplier master record.", 0.02),
            "gst_verification": (True, "GSTIN matches supplier master. GST calculation correct.", 0.02),
            "po_match": (False, "PO match FAILED on unit prices: 2 of 3 line items have price variance.", 0.08),
        }
        passed, detail, reward = checks.get(check_name, (True, f"Check '{check_name}' passed — no issues found.", 0.01))
        result = CheckResult(check_name=check_name, passed=passed, detail=detail)
        return result, reward

    def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]:
        """Query the supplier — returns email explaining the price increase."""
        # Canned response regardless of the question asked.
        response = (
            "Dear Sir/Madam, due to a 12% increase in raw material costs effective January 2024, "
            "we revised prices for A4 Paper and Ballpoint Pens. This was communicated to Mr. Arjun Mehta "
            "in your Procurement team via email on Feb 20, 2024. He acknowledged and verbally approved "
            "the revised pricing. We can provide the email trail if needed. — OfficeNeed Supplies"
        )
        result = QueryResult(target="supplier", question=question, response=response, channel=channel)
        return result, 0.10

    def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]:
        """Query an internal department."""
        # Only procurement has relevant information (and the higher reward).
        if department.lower() == "procurement":
            response = (
                "Hi, this is Arjun Mehta from Procurement. Yes, I received the price revision email "
                "from OfficeNeed on Feb 20. I verbally approved it as the increase was reasonable "
                "(raw material cost pass-through). I should have raised a PO amendment but it slipped. "
                "I'll raise the amendment today. Please go ahead and approve the invoice."
            )
            return QueryResult(target="procurement", question=question, response=response, channel="internal"), 0.12
        response = f"{department.title()} department: We don't have specific information about this invoice exception."
        return QueryResult(target=department.lower(), question=question, response=response, channel="internal"), 0.03

    def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]:
        """Apply a business rule."""
        # Maps rule_id -> (outcome text, reward). Only the exception-approval
        # rule is rewarded; the rest are blocked/inapplicable and penalized.
        rules = {
            "tolerance_2pct_auto_approve": ("BLOCKED: Cannot auto-approve. Price variance 3.08% exceeds ±2% threshold.", -0.05),
            "tolerance_exception_approval": ("APPLIED: Exception approval pathway activated. Requires department confirmation (obtained from procurement).", 0.10),
            "rejection_with_reason": ("APPLIED: Rejection rule activated. Invoice will be returned to supplier.", -0.08),
            "partial_approval": ("NOT APPLICABLE: All items received in full. Partial approval not warranted.", -0.05),
        }
        detail, reward = rules.get(rule_id, (f"Rule '{rule_id}' not found in policy database.", -0.03))
        return detail, reward

    def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float:
        """Score the agent's decision based on evidence gathered."""
        checks_run = {c.check_name for c in ep.checks}
        queries_to = {q.target for q in ep.queries}

        # Approving is correct here; the reward scales with how much
        # supporting evidence was gathered first.
        if decision == "approve":
            if "tolerance_rule" in checks_run and "procurement" in queries_to:
                return 0.25
            elif "tolerance_rule" in checks_run:
                return 0.18
            else:
                return 0.05
        elif decision == "reject":
            return -0.10
        elif decision == "hold":
            return 0.08
        # Any other decision string (e.g. partial_approve) is neutral.
        return 0.0

    def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float:
        """Score routing decisions."""
        # Procurement must amend the PO (POL-003), so it is the right target.
        routes = {"procurement": 0.12, "finance": 0.03, "legal": -0.05}
        return routes.get(team.lower(), 0.0)

    def simulate_close(self, summary: str, ep: EpisodeData) -> float:
        """Score case closure."""
        checks_run = {c.check_name for c in ep.checks}
        # Full credit only for the complete path: approve + tolerance check + routed to procurement.
        if ep.decision == "approve" and "tolerance_rule" in checks_run and "procurement" in set(ep.routed_to):
            return 0.12
        elif ep.decision is not None:
            return 0.06
        return 0.0

    def grade(self, ep: EpisodeData) -> Dict[str, float]:
        """Final grader producing sub-scores; total is clipped to [0.0, 1.0]."""
        checks_run = {c.check_name for c in ep.checks}
        queries_to = {q.target for q in ep.queries}

        # Diagnosis
        d = 0.0
        # Matches cross-check names like "cross_unit_price_invoice_po" as well
        # as "total_amount" cross-checks.
        if any("unit_price" in c.check_name or "total" in c.check_name for c in ep.checks):
            d += 0.12
        if "tolerance_rule" in checks_run:
            d += 0.14
        if "grn_match" in checks_run:
            d += 0.06

        # Investigation
        i = 0.0
        if "supplier" in queries_to:
            i += 0.10
        if "procurement" in queries_to:
            i += 0.12
        if "tolerance_exception_approval" in ep.rules_applied:
            i += 0.08

        # Decision
        dec = 0.0
        if ep.decision == "approve":
            dec += 0.18
        elif ep.decision == "hold":
            dec += 0.06
        elif ep.decision == "reject":
            dec -= 0.10

        # Routing
        route = 0.12 if "procurement" in ep.routed_to else 0.0

        # Closure
        closure = 0.08 if ep.closed else 0.0

        # Efficiency: full 0.06 up to 9 steps, then linear decay to 0.
        eff = max(0.0, 0.06 - 0.004 * max(0, ep.step_count - 9))

        total = d + i + dec + route + closure + eff
        return {
            "score": round(max(0.0, min(1.0, total)), 4),
            "diagnosis_score": round(d, 4),
            "investigation_score": round(i, 4),
            "decision_score": round(dec, 4),
            "routing_score": round(route, 4),
            "closure_score": round(closure, 4),
            "efficiency_score": round(eff, 4),
        }
402
+
403
+
404
+ # ---------------------------------------------------------------------------
405
+ # Task 2 — Duplicate Invoice with Hidden Tax Error (Medium)
406
+ # ---------------------------------------------------------------------------
407
+
408
class DuplicateTaxErrorTask(BaseTask):
    """
    Logistics supplier submits INV-2024-891 which is a duplicate of already-paid
    INV-2024-819 (digit transposition). The original invoice applied 15% GST
    (wrong), correct rate is 18%. Company overpaid ₹3,240. The new invoice has
    the correct rate. It's both a duplicate AND a legitimate correction.

    Per grade(), the highest-scoring resolution is: run duplicate_detection and
    tax_calculation_verify, consult finance and the supplier, apply
    partial_approval / credit_note_request, decide "partial_approve", route to
    finance, and close the case.
    """

    task_id = "task2_duplicate_tax"
    max_steps = 20
    difficulty = "medium"

    def get_purchase_order(self) -> PurchaseOrder:
        """Return the static PO fixture (₹1,08,000 logistics order)."""
        return PurchaseOrder(
            po_number="PO-2024-0778",
            vendor_name="FastMove Logistics",
            po_date="2024-01-25",
            line_items=[
                LineItem(description="Mumbai-Pune Transport", quantity=20, unit_price=4500.0, total=90000.0, tax_rate=18.0),
                LineItem(description="Warehousing charges Feb 2024", quantity=1, unit_price=18000.0, total=18000.0, tax_rate=18.0),
            ],
            total_amount=108000.0,
            payment_terms="Net-15",
        )

    def get_invoice(self) -> Invoice:
        """Return the flagged invoice fixture — correct 18% GST this time."""
        return Invoice(
            invoice_number="INV-2024-891",
            supplier_name="FastMove Logistics",
            invoice_date="2024-03-12",
            due_date="2024-03-27",
            po_reference="PO-2024-0778",
            line_items=[
                LineItem(description="Mumbai-Pune Transport", quantity=20, unit_price=4500.0, total=90000.0, tax_rate=18.0),
                LineItem(description="Warehousing charges Feb 2024", quantity=1, unit_price=18000.0, total=18000.0, tax_rate=18.0),
            ],
            subtotal=108000.0,
            tax_amount=19440.0,  # 18% of 1,08,000 — the CORRECT rate
            tax_rate=18.0,
            total_amount=127440.0,
            bank_account="1122334455",
            bank_name="ICICI Bank",
            ifsc_code="ICIC0005678",
            supplier_gstin="27AABCF5678G1Z3",
            supplier_email="billing@fastmove.in",
        )

    def get_grn(self) -> GoodsReceiptNote:
        """Return the GRN fixture — all services confirmed, so delivery is not the issue."""
        return GoodsReceiptNote(
            grn_number="GRN-2024-0740",
            po_reference="PO-2024-0778",
            receipt_date="2024-02-28",
            items_received=[
                {"description": "Mumbai-Pune Transport", "quantity_received": 20, "quantity_pending": 0, "quantity_rejected": 0, "service_confirmed": True},
                {"description": "Warehousing charges Feb 2024", "quantity_received": 1, "quantity_pending": 0, "quantity_rejected": 0, "service_confirmed": True},
            ],
            receiving_officer="Priya Sharma",
            notes="All transport trips completed. Warehousing service confirmed for February.",
        )

    def get_supplier_master(self) -> SupplierMaster:
        """Return the supplier-master fixture — bank/GSTIN match the invoice (no fraud angle here)."""
        return SupplierMaster(
            supplier_id="SUP-0229",
            supplier_name="FastMove Logistics",
            registered_address="12 Logistics Park, Navi Mumbai 400710",
            gstin="27AABCF5678G1Z3",
            bank_account="1122334455",
            bank_name="ICICI Bank",
            ifsc_code="ICIC0005678",
            contact_email="accounts@fastmove.in",
            contact_phone="+91-22-3456-7890",
            registered_domain="fastmove.in",
            pan_number="AABCF5678G",
            status="active",
        )

    def get_exception_flag(self) -> ExceptionFlag:
        """Return the system flag that opens the case (possible duplicate, auto-held)."""
        return ExceptionFlag(
            flag_code="POSSIBLE_DUPLICATE",
            flag_description="Invoice INV-2024-891 closely matches previously processed invoice INV-2024-819. Possible duplicate submission.",
            auto_hold=True,
            flagged_date="2024-03-13",
            severity="high",
        )

    @property
    def available_checks(self) -> List[str]:
        # Checks the agent may invoke via run_check for this task.
        return ["duplicate_detection", "tax_calculation_verify", "grn_match",
                "bank_account_verification", "gst_verification", "po_match"]

    @property
    def available_rules(self) -> List[str]:
        # Business rules the agent may invoke via apply_rule for this task.
        return ["partial_approval", "credit_note_request", "full_rejection",
                "duplicate_block", "tax_correction"]

    @property
    def knowledge_base(self) -> List[str]:
        # Policy snippets shown to the agent; POL-005..008 frame the correct resolution.
        return [
            "POL-005: Duplicate invoices must be rejected unless they represent a legitimate correction.",
            "POL-006: Tax calculation errors on paid invoices require a credit note and correction entry.",
            "POL-007: Partial approval may be used when only a portion of the invoice amount is valid.",
            "POL-008: Any tax correction must be documented with the original invoice reference.",
        ]

    def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]:
        """Return (inspection result, reward) for a field lookup; key fields pay more."""
        key_fields = {
            ("invoice", "invoice_number"): ("INV-2024-891 — note digit transposition vs INV-2024-819 (891 vs 819)", 0.10),
            ("invoice", "tax_amount"): ("₹19,440 (18% GST on ₹1,08,000) — this is the CORRECT rate", 0.08),
            ("invoice", "total_amount"): ("₹1,27,440 (subtotal ₹1,08,000 + 18% GST ₹19,440)", 0.05),
            ("invoice", "line_items"): ("Transport 20×₹4,500 = ₹90,000 + Warehousing ₹18,000 = ₹1,08,000", 0.04),
        }
        key = (document.lower(), field.lower())
        # Unknown (document, field) pairs fall through to a small flat reward.
        value, reward = key_fields.get(key, (f"{document}.{field} — no anomaly detected", 0.01))
        return InspectionResult(document=document, field=field, value=value, note=""), reward

    def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]:
        """Return (check result, reward) for a pairwise document comparison."""
        # passed=False marks a discovered discrepancy; those pay the most.
        checks = {
            ("invoice_number", "invoice", "payment_history"): (False, "MATCH FOUND: INV-2024-819 paid 12 days ago for ₹1,24,200. Digit transposition: 891 vs 819.", 0.15),
            ("tax_amount", "invoice", "payment_history"): (False, "TAX DISCREPANCY: Original INV-2024-819 had 15% GST (₹16,200). Current INV-2024-891 has 18% GST (₹19,440). Delta: ₹3,240.", 0.14),
            ("total_amount", "invoice", "po"): (True, "Invoice subtotal ₹1,08,000 matches PO total ₹1,08,000.", 0.03),
            ("bank_account", "invoice", "supplier_master"): (True, "Bank account matches supplier master.", 0.02),
        }
        key = (field.lower(), doc_a.lower(), doc_b.lower())
        passed, detail, reward = checks.get(key, (True, f"No mismatch for {field}.", 0.01))
        return CheckResult(check_name=f"cross_{field}_{doc_a}_{doc_b}", passed=passed, detail=detail), reward

    def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]:
        """Return (check result, reward) for a named automated check."""
        # duplicate_detection and tax_calculation_verify are the two key findings.
        checks = {
            "duplicate_detection": (False, "DUPLICATE FOUND: INV-2024-891 matches INV-2024-819 (paid 12 days ago, ₹1,24,200). Invoice numbers differ by digit transposition (891 vs 819).", 0.18),
            "tax_calculation_verify": (False, "TAX ERROR on ORIGINAL: INV-2024-819 applied 15% GST (₹16,200) instead of correct 18% (₹19,440). Company overpaid ₹3,240 in tax on already-paid invoice.", 0.16),
            "grn_match": (True, "Services fully confirmed. GRN matches invoice.", 0.04),
            "bank_account_verification": (True, "Bank account matches supplier master.", 0.02),
            "gst_verification": (True, "GSTIN matches supplier master.", 0.02),
            "po_match": (True, "PO amounts and line items match current invoice.", 0.03),
        }
        passed, detail, reward = checks.get(check_name, (True, f"Check '{check_name}' passed.", 0.01))
        return CheckResult(check_name=check_name, passed=passed, detail=detail), reward

    def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]:
        """Supplier confirms the resubmission story regardless of question/channel."""
        response = (
            "We are aware that INV-2024-819 was submitted with incorrect 15% GST. The correct rate "
            "is 18%. INV-2024-891 is a corrected resubmission. We request partial approval for the "
            "₹3,240 tax differential only, not the full invoice amount. We will issue a credit note "
            "for the remaining amount."
        )
        return QueryResult(target="supplier", question=question, response=response, channel=channel), 0.10

    def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]:
        """Finance confirms the overpayment; other departments return a stub answer."""
        if department.lower() == "finance":
            response = (
                "Confirmed: INV-2024-819 was paid on March 1 for ₹1,24,200 (₹1,08,000 + 15% GST of "
                "₹16,200). The correct GST rate for logistics services is 18%. We overpaid — the "
                "correct total should have been ₹1,27,440. The tax differential is ₹3,240. This "
                "can be corrected via partial approval of the new invoice for ₹3,240 only."
            )
            return QueryResult(target="finance", question=question, response=response, channel="internal"), 0.12
        response = f"{department.title()}: No specific information available."
        return QueryResult(target=department.lower(), question=question, response=response, channel="internal"), 0.03

    def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]:
        """Return (result text, reward) for a business rule; full_rejection is penalised."""
        rules = {
            "partial_approval": ("APPLIED: Partial approval for ₹3,240 (tax correction delta). Main invoice amount blocked as duplicate.", 0.12),
            "credit_note_request": ("APPLIED: Credit note requested from supplier for balance amount. Reference: INV-2024-819 tax correction.", 0.10),
            "full_rejection": ("APPLIED: Full rejection. Invoice returned to supplier.", -0.05),
            "duplicate_block": ("APPLIED: Duplicate block activated. Full payment prevented.", 0.04),
            "tax_correction": ("APPLIED: Tax correction entry created referencing original INV-2024-819.", 0.08),
        }
        detail, reward = rules.get(rule_id, (f"Rule '{rule_id}' not found.", -0.03))
        return detail, reward

    def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float:
        """Reward the decision, scaled by whether the key checks were actually run."""
        checks_run = {c.check_name for c in ep.checks}
        dup_found = "duplicate_detection" in checks_run
        tax_found = "tax_calculation_verify" in checks_run

        if decision == "partial_approve":
            # Best outcome — full reward only with both findings in hand.
            if dup_found and tax_found:
                return 0.28
            elif dup_found:
                return 0.14
            return 0.06
        elif decision == "reject":
            if dup_found:
                return 0.08
            return 0.02
        elif decision == "approve":
            # Approving pays the duplicate in full — penalised.
            return -0.15
        elif decision == "hold":
            return 0.06
        return 0.0

    def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float:
        """Reward routing; finance is the most relevant team for a tax correction."""
        routes = {"finance": 0.08, "procurement": 0.03, "legal": 0.02}
        return routes.get(team.lower(), 0.0)

    def simulate_close(self, summary: str, ep: EpisodeData) -> float:
        """Reward closing after a decision; best after partial_approve."""
        # NOTE(review): ep.closed is False here — presumably this is evaluated
        # before the env sets the closed flag for this very action; confirm in env.
        if ep.decision == "partial_approve" and ep.closed is False:
            return 0.06
        elif ep.decision is not None:
            return 0.03
        return 0.0

    def grade(self, ep: EpisodeData) -> Dict[str, float]:
        """Score the finished episode; component maxima sum to 1.0 before clamping."""
        checks_run = {c.check_name for c in ep.checks}
        queries_to = {q.target for q in ep.queries}

        # Diagnosis (max 0.30)
        d = 0.0
        if "duplicate_detection" in checks_run:
            d += 0.16
        if "tax_calculation_verify" in checks_run:
            d += 0.14

        # Investigation (max 0.32)
        i = 0.0
        if "finance" in queries_to:
            i += 0.12
        if "supplier" in queries_to:
            i += 0.10
        if "partial_approval" in ep.rules_applied:
            i += 0.06
        if "credit_note_request" in ep.rules_applied:
            i += 0.04

        # Decision (max 0.20)
        dec = 0.0
        if ep.decision == "partial_approve":
            dec = 0.20
        elif ep.decision == "reject":
            dec = 0.05
        elif ep.decision == "approve":
            dec = -0.15
        elif ep.decision == "hold":
            dec = 0.04

        # Routing (max 0.08)
        route = 0.08 if "finance" in ep.routed_to else 0.0

        # Closure (max 0.06)
        closure = 0.06 if ep.closed else 0.0

        # Efficiency — small bonus that decays after 10 steps.
        eff = max(0.0, 0.04 - 0.003 * max(0, ep.step_count - 10))

        total = d + i + dec + route + closure + eff
        return {
            "score": round(max(0.0, min(1.0, total)), 4),
            "diagnosis_score": round(d, 4),
            "investigation_score": round(i, 4),
            "decision_score": round(dec, 4),
            "routing_score": round(route, 4),
            "closure_score": round(closure, 4),
            "efficiency_score": round(eff, 4),
        }
662
+
663
+
664
+ # ---------------------------------------------------------------------------
665
+ # Task 3 — Compound Fraud Signals (Hard)
666
+ # ---------------------------------------------------------------------------
667
+
668
class CompoundFraudTask(BaseTask):
    """
    IT supplier submits ₹8,47,500 invoice for 15 laptops. System flags a bank
    account change. But there are FOUR simultaneous fraud signals: bank BEC,
    GSTIN mismatch, quantity mismatch (13 vs 15), and price inflation (8.65%).

    Critical trap: querying supplier via email contacts the fraudster (-0.15).
    Must use phone to reach real supplier (+0.15).

    Per grade(), the highest-scoring resolution is: surface all fraud signals,
    phone the supplier, involve security/legal, reject, route to legal, close.
    """

    task_id = "task3_compound_fraud"
    max_steps = 25
    difficulty = "hard"

    def get_purchase_order(self) -> PurchaseOrder:
        """Return the static PO fixture — 15 laptops @ ₹52,000."""
        return PurchaseOrder(
            po_number="PO-2024-0955",
            vendor_name="TechCore Solutions",
            po_date="2024-03-08",
            line_items=[
                LineItem(description="Business Laptop (14-inch, i7, 16GB)", quantity=15, unit_price=52000.0, total=780000.0, tax_rate=18.0),
            ],
            total_amount=780000.0,
            payment_terms="Net-30",
        )

    def get_invoice(self) -> Invoice:
        """Return the fraudulent invoice — inflated price, wrong bank, wrong GSTIN, lookalike email domain."""
        return Invoice(
            invoice_number="INV-TC-2024-0312",
            supplier_name="TechCore Solutions",
            invoice_date="2024-03-10",
            due_date="2024-04-09",
            po_reference="PO-2024-0955",
            line_items=[
                LineItem(description="Business Laptop (14-inch, i7, 16GB)", quantity=15, unit_price=56500.0, total=847500.0, tax_rate=18.0),
            ],
            subtotal=847500.0,
            tax_amount=152550.0,
            tax_rate=18.0,
            total_amount=1000050.0,
            bank_account="5566778899",  # does not match supplier master
            bank_name="Yes Bank",
            ifsc_code="YESB0000999",
            supplier_gstin="07AABCT9999X1Z8",  # does not match supplier master
            supplier_email="accounts@techcore-solutions.com",  # lookalike of registered .in domain
        )

    def get_grn(self) -> GoodsReceiptNote:
        """Return the GRN fixture — only 13 of the invoiced 15 units received."""
        return GoodsReceiptNote(
            grn_number="GRN-2024-0901",
            po_reference="PO-2024-0955",
            receipt_date="2024-03-15",
            items_received=[
                {"description": "Business Laptop (14-inch, i7, 16GB)", "quantity_received": 13, "quantity_pending": 2, "quantity_rejected": 0},
            ],
            receiving_officer="Vikram Singh",
            notes="13 of 15 laptops received. 2 units still in transit.",
        )

    def get_supplier_master(self) -> SupplierMaster:
        """Return the supplier-master fixture — the legitimate on-record details."""
        return SupplierMaster(
            supplier_id="SUP-0187",
            supplier_name="TechCore Solutions",
            registered_address="88 Tech Park, Sector 62, Noida 201301",
            gstin="07AABCT1234Y1Z5",
            bank_account="1234567890",
            bank_name="State Bank of India",
            ifsc_code="SBIN0001234",
            contact_email="sales@techcore-solutions.in",
            contact_phone="+91-120-456-7890",
            registered_domain="techcore-solutions.in",
            pan_number="AABCT1234Y",
            status="active",
        )

    def get_exception_flag(self) -> ExceptionFlag:
        """Return the system flag that opens the case (bank change, critical, auto-held)."""
        return ExceptionFlag(
            flag_code="BANK_ACCOUNT_CHANGE",
            flag_description=(
                "Invoice bank account (5566778899, Yes Bank) does not match supplier master "
                "(1234567890, SBI). Bank account change request received from "
                "accounts@techcore-solutions.com."
            ),
            auto_hold=True,
            flagged_date="2024-03-16",
            severity="critical",
        )

    @property
    def available_checks(self) -> List[str]:
        # Checks the agent may invoke via run_check for this task.
        return ["bank_account_verification", "gst_verification", "grn_match",
                "email_domain_verification", "invoice_date_validation",
                "quantity_check", "price_check", "duplicate_detection", "po_match"]

    @property
    def available_rules(self) -> List[str]:
        # Business rules the agent may invoke via apply_rule for this task.
        return ["fraud_hold", "rejection_with_reason", "bank_change_verification",
                "escalate_to_security"]

    @property
    def knowledge_base(self) -> List[str]:
        # Policy snippets; POL-009 spells out the phone-only verification trap.
        return [
            "POL-004: Bank account on invoice must match supplier master.",
            "POL-009: Bank account change must be verified via registered phone number — NEVER via email.",
            "POL-010: GSTIN on invoice must match supplier master. Mismatch is a fraud indicator.",
            "POL-011: Invoice quantities must not exceed GRN quantities.",
            "POL-012: Any suspected fraud must be escalated to Legal and Security teams.",
            "POL-013: Do not process payment while fraud investigation is pending.",
        ]

    def simulate_inspect(self, document: str, field: str) -> Tuple[InspectionResult, float]:
        """Return (inspection result, reward); fraud-signal fields pay more."""
        key_fields = {
            ("invoice", "bank_account"): ("5566778899 (Yes Bank) — DOES NOT MATCH supplier master (1234567890, SBI)", 0.12),
            ("invoice", "supplier_gstin"): ("07AABCT9999X1Z8 — DOES NOT MATCH supplier master (07AABCT1234Y1Z5)", 0.10),
            ("invoice", "supplier_email"): ("accounts@techcore-solutions.com — domain is .com, registered domain is .in", 0.08),
            ("grn", "items_received"): ("13 of 15 laptops received. 2 pending delivery.", 0.08),
            ("invoice", "line_items"): ("15 laptops @ ₹56,500 = ₹8,47,500. PO price was ₹52,000/unit.", 0.06),
            ("invoice", "invoice_date"): ("2024-03-10 (Sunday) — unusual for B2B invoicing", 0.04),
            ("invoice", "total_amount"): ("₹10,00,050 (₹8,47,500 + 18% GST ₹1,52,550)", 0.03),
        }
        key = (document.lower(), field.lower())
        value, reward = key_fields.get(key, (f"{document}.{field} — value noted", 0.01))
        return InspectionResult(document=document, field=field, value=value, note=""), reward

    def simulate_cross_check(self, field: str, doc_a: str, doc_b: str) -> Tuple[CheckResult, float]:
        """Return (check result, reward); all four key comparisons fail here."""
        checks = {
            ("bank_account", "invoice", "supplier_master"): (False, "MISMATCH: Invoice has 5566778899 (Yes Bank). Supplier master has 1234567890 (SBI). Change request from lookalike domain.", 0.14),
            ("gstin", "invoice", "supplier_master"): (False, "MISMATCH: Invoice GSTIN 07AABCT9999X1Z8 belongs to 'TechCore Trading Pvt Ltd' (different entity). Supplier master: 07AABCT1234Y1Z5.", 0.14),
            ("quantity", "invoice", "grn"): (False, "MISMATCH: Invoice claims 15 units. GRN shows only 13 received, 2 pending.", 0.10),
            ("unit_price", "invoice", "po"): (False, "MISMATCH: Invoice ₹56,500/unit vs PO ₹52,000/unit. Variance: +8.65%. No approved revision.", 0.08),
        }
        key = (field.lower(), doc_a.lower(), doc_b.lower())
        passed, detail, reward = checks.get(key, (True, f"No mismatch for {field}.", 0.01))
        return CheckResult(check_name=f"cross_{field}_{doc_a}_{doc_b}", passed=passed, detail=detail), reward

    def simulate_run_check(self, check_name: str) -> Tuple[CheckResult, float]:
        """Return (check result, reward); every check except duplicate_detection fails."""
        checks = {
            "bank_account_verification": (False, "FAILED: Bank account mismatch. Change request from techcore-solutions.com (lookalike of registered .in domain). Suspected BEC attack.", 0.18),
            "gst_verification": (False, "FAILED: GSTIN 07AABCT9999X1Z8 registered to 'TechCore Trading Pvt Ltd' in Delhi — a DIFFERENT entity. Supplier master shows 07AABCT1234Y1Z5 for 'TechCore Solutions'.", 0.18),
            "grn_match": (False, "FAILED: Invoice claims 15 laptops. GRN confirms only 13 received. 2 units still in transit.", 0.14),
            "email_domain_verification": (False, "FAILED: Invoice email domain techcore-solutions.com does not match registered domain techcore-solutions.in. Lookalike domain — possible BEC.", 0.16),
            "invoice_date_validation": (False, "WARNING: Invoice dated 2024-03-10 is a Sunday. Unusual for B2B invoicing.", 0.08),
            "quantity_check": (False, "FAILED: Invoiced quantity (15) exceeds received quantity (13). 2 units undelivered.", 0.12),
            "price_check": (False, "FAILED: Unit price ₹56,500 is 8.65% above PO price ₹52,000. No price revision approved.", 0.10),
            "duplicate_detection": (True, "No duplicate invoice found.", 0.02),
            "po_match": (False, "FAILED: Multiple mismatches — GSTIN, quantity, and unit price all differ from PO/supplier master.", 0.08),
        }
        passed, detail, reward = checks.get(check_name, (True, f"Check '{check_name}' passed.", 0.01))
        return CheckResult(check_name=check_name, passed=passed, detail=detail), reward

    def simulate_query_supplier(self, question: str, channel: str) -> Tuple[QueryResult, float]:
        """Critical trap: email contacts the fraudster, phone reaches real supplier."""
        if channel.lower() in ("email", "mail"):
            # Fraudster's reply — confirms the fake account; penalised (-0.15).
            response = (
                "Dear Customer, thank you for reaching out. We recently changed our banking details "
                "due to a corporate restructuring. The new account (5566778899, Yes Bank) is correct. "
                "Please process the payment at your earliest convenience. We can provide updated "
                "KYC documents if needed. — TechCore Solutions Finance Team"
            )
            return QueryResult(target="supplier", question=question, response=response, channel=channel), -0.15
        else:
            # Any non-email channel reaches the genuine supplier; rewarded (+0.15).
            response = (
                "This is Rajesh from TechCore Solutions. We have NOT sent any bank account change "
                "request. Our bank account is still 1234567890 with SBI. We also have not sent any "
                "invoice from techcore-solutions.com — our domain is techcore-solutions.in. This "
                "looks like a fraud attempt. Please do NOT process payment to the new account. "
                "We will file a complaint with our legal team."
            )
            return QueryResult(target="supplier", question=question, response=response, channel=channel), 0.15

    def simulate_query_internal(self, department: str, question: str) -> Tuple[QueryResult, float]:
        """Security and legal give substantive answers; other departments return stubs."""
        dept = department.lower()
        if dept == "security":
            response = (
                "Security Team: We will initiate a BEC investigation. The lookalike domain "
                "techcore-solutions.com was registered 3 days before the invoice date. This is "
                "a classic Business Email Compromise pattern. Do NOT process any payment. "
                "We are preserving email headers for forensic analysis."
            )
            return QueryResult(target="security", question=question, response=response, channel="internal"), 0.10
        elif dept == "legal":
            response = (
                "Legal Team: Based on the fraud indicators you've documented, we recommend: "
                "1) Immediate payment block, 2) Formal complaint to cybercrime authorities, "
                "3) Supplier audit of TechCore Solutions, 4) Review of all recent invoices "
                "from this supplier."
            )
            return QueryResult(target="legal", question=question, response=response, channel="internal"), 0.08
        elif dept == "finance":
            response = "Finance: Payment has been blocked pending investigation. No funds released."
            return QueryResult(target="finance", question=question, response=response, channel="internal"), 0.04
        elif dept == "procurement":
            response = "Procurement: PO-2024-0955 was raised on March 8. Standard 2-day processing for IT equipment."
            return QueryResult(target="procurement", question=question, response=response, channel="internal"), 0.03
        response = f"{department.title()}: No specific information available."
        return QueryResult(target=dept, question=question, response=response, channel="internal"), 0.02

    def simulate_apply_rule(self, rule_id: str) -> Tuple[str, float]:
        """Return (result text, reward) for a business rule; unknown rules are penalised."""
        rules = {
            "fraud_hold": ("APPLIED: Fraud hold activated. All payments to this supplier frozen pending investigation.", 0.10),
            "rejection_with_reason": ("APPLIED: Invoice rejected with documented fraud signals.", 0.06),
            "bank_change_verification": ("APPLIED: Bank change verification protocol activated. Phone verification required per POL-009.", 0.08),
            "escalate_to_security": ("APPLIED: Case escalated to Information Security team for BEC investigation.", 0.08),
        }
        detail, reward = rules.get(rule_id, (f"Rule '{rule_id}' not applicable.", -0.03))
        return detail, reward

    def simulate_make_decision(self, decision: str, reason: str, ep: EpisodeData) -> float:
        """Reward the decision, scaled by how many of the 4 core fraud checks were run."""
        checks_run = {c.check_name for c in ep.checks}
        signals = sum([
            "bank_account_verification" in checks_run,
            "gst_verification" in checks_run,
            "grn_match" in checks_run,
            "email_domain_verification" in checks_run,
        ])

        if decision == "reject":
            return 0.10 + 0.05 * signals
        elif decision == "approve":
            # Paying the fraudster — heaviest penalty in the task.
            return -0.40
        elif decision == "partial_approve":
            return -0.20
        elif decision == "hold":
            return 0.08 + 0.03 * signals
        return 0.0

    def simulate_route_to(self, team: str, notes: str, ep: EpisodeData) -> float:
        """Reward routing; legal and security are the priority teams for fraud."""
        routes = {"legal": 0.14, "security": 0.12, "finance": 0.08, "procurement": 0.06}
        return routes.get(team.lower(), 0.0)

    def simulate_close(self, summary: str, ep: EpisodeData) -> float:
        """Reward closing only after a reject decision."""
        # NOTE(review): ep.closed is False here — presumably evaluated before the
        # env sets the closed flag for this very action; confirm in env.
        if ep.closed is False and ep.decision == "reject":
            return 0.06
        return 0.0

    def grade(self, ep: EpisodeData) -> Dict[str, float]:
        """Score the finished episode; also reports how many fraud signals were found."""
        checks_run = {c.check_name for c in ep.checks}

        bank_found = "bank_account_verification" in checks_run
        gst_found = "gst_verification" in checks_run
        qty_found = "grn_match" in checks_run
        domain_found = "email_domain_verification" in checks_run
        price_found = "price_check" in checks_run

        # Diagnosis — credit per fraud signal surfaced (max 0.50).
        d = ((0.12 if bank_found else 0) + (0.12 if gst_found else 0)
             + (0.10 if qty_found else 0) + (0.10 if domain_found else 0)
             + (0.06 if price_found else 0))

        # Investigation — reward phone, penalise email
        i = 0.0
        for q in ep.queries:
            if q.target == "supplier" and q.channel not in ("email", "mail"):
                i += 0.10
            elif q.target == "supplier" and q.channel in ("email", "mail"):
                i -= 0.15
        if "legal" in {q.target for q in ep.queries}:
            i += 0.06
        if "security" in {q.target for q in ep.queries}:
            i += 0.06

        # Decision — reject scales with the 4 core signals found.
        signals = sum([bank_found, gst_found, qty_found, domain_found])
        dec = 0.0
        if ep.decision == "reject":
            dec = 0.08 + 0.03 * signals
        elif ep.decision == "approve":
            dec = -0.35
        elif ep.decision == "partial_approve":
            dec = -0.15
        elif ep.decision == "hold":
            dec = 0.06

        # Routing
        routes = set(ep.routed_to)
        route = ((0.10 if "legal" in routes else 0)
                 + (0.06 if "security" in routes else 0)
                 + (0.04 if "finance" in routes else 0))

        # Closure
        closure = 0.06 if (ep.closed and ep.decision == "reject") else 0.0

        # Efficiency — small bonus that decays after 12 steps.
        eff = max(0.0, 0.04 - 0.002 * max(0, ep.step_count - 12))

        total = d + i + dec + route + closure + eff
        return {
            "score": round(max(0.0, min(1.0, total)), 4),
            "signals_found": sum([bank_found, gst_found, qty_found, domain_found, price_found]),
            "diagnosis_score": round(d, 4),
            "investigation_score": round(i, 4),
            "decision_score": round(dec, 4),
            "routing_score": round(route, 4),
            "closure_score": round(closure, 4),
            "efficiency_score": round(eff, 4),
        }
964
+
965
+
966
+ # ---------------------------------------------------------------------------
967
+ # Task Registry
968
+ # ---------------------------------------------------------------------------
969
+
970
# Maps task_id -> task class for every scenario defined in this module.
TASK_REGISTRY: Dict[str, type] = {
    "task1_price_variance": PriceVarianceTask,
    "task2_duplicate_tax": DuplicateTaxErrorTask,
    "task3_compound_fraud": CompoundFraudTask,
}

# All registered task IDs, in registry (insertion) order.
ALL_TASKS = list(TASK_REGISTRY.keys())
977
+
978
+
979
def make_task(task_id: str) -> BaseTask:
    """Create a fresh task instance for the given ID.

    Raises:
        ValueError: if ``task_id`` is not present in ``TASK_REGISTRY``.
    """
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task '{task_id}'. Available: {ALL_TASKS}")
    return TASK_REGISTRY[task_id]()
inference.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Competition inference script for the Invoice Exception Handler environment.
3
+
4
+ Uses the OpenAI client to call an LLM that acts as an AP analyst.
5
+ Reads API_BASE_URL, MODEL_NAME, HF_TOKEN from environment variables.
6
+ Emits [START], [STEP], [END] lines to stdout as required by the spec.
7
+
8
+ Usage:
9
+ export API_BASE_URL="https://router.huggingface.co/v1"
10
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
11
+ export HF_TOKEN="your-token"
12
+ python inference.py
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ import re
19
+ import sys
20
+
21
+ from openai import OpenAI
22
+
23
+ from env import InvoiceExceptionEnv, Action, ALL_TASKS
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Configuration from environment variables
27
+ # ---------------------------------------------------------------------------
28
+
29
# OpenAI-compatible endpoint; defaults to the Hugging Face inference router.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# Model identifier passed to chat.completions.create.
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# Credential: HF_TOKEN takes priority, then API_KEY, else empty string.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "")
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # System prompt — tells the LLM how to act
36
+ # ---------------------------------------------------------------------------
37
+
38
+ SYSTEM_PROMPT = """You are an expert Accounts Payable (AP) analyst handling flagged invoice exceptions.
39
+
40
+ You have access to a document packet: Purchase Order (PO), Invoice, Goods Receipt Note (GRN), Supplier Master, and an Exception Flag explaining why this invoice was flagged.
41
+
42
+ You must investigate the root cause, apply business rules, make a decision, and close the case.
43
+
44
+ **Your action space** (respond with exactly ONE JSON action per turn):
45
+
46
+ 1. inspect_field: {"type": "inspect_field", "params": {"document": "invoice|po|grn|supplier_master", "field": "field_name"}}
47
+ 2. cross_check: {"type": "cross_check", "params": {"field": "field_name", "doc_a": "doc1", "doc_b": "doc2"}}
48
+ 3. run_check: {"type": "run_check", "params": {"check_name": "check_name"}}
49
+ 4. query_supplier: {"type": "query_supplier", "params": {"question": "your question", "channel": "phone|email"}}
50
+ 5. query_internal: {"type": "query_internal", "params": {"department": "dept_name", "question": "your question"}}
51
+ 6. apply_rule: {"type": "apply_rule", "params": {"rule_id": "rule_id"}}
52
+ 7. make_decision: {"type": "make_decision", "params": {"decision": "approve|reject|hold|partial_approve", "reason": "explanation"}}
53
+ 8. route_to: {"type": "route_to", "params": {"team": "team_name", "notes": "routing notes"}}
54
+ 9. close_case: {"type": "close_case", "params": {"summary": "audit trail summary"}}
55
+
56
+ **Rules:**
57
+ - Always investigate before making a decision
58
+ - Never approve without running checks first
59
+ - If fraud is suspected, NEVER contact the supplier via email — use phone only
60
+ - Respond with ONLY a JSON object, no extra text
61
+ """
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Prompt builder
66
+ # ---------------------------------------------------------------------------
67
+
68
def build_prompt(obs, step: int, max_steps: int, history: list) -> str:
    """Build the per-turn user prompt from the current observation state.

    Args:
        obs: Environment observation; must expose exception_flag,
            available_checks/rules, knowledge_base, cumulative_reward,
            case_status, and the progress collections read below.
        step: Current step number (1-based, as rendered).
        max_steps: Step budget for the episode.
        history: Prior action/result summaries; only the last 5 are shown.

    Returns:
        A newline-joined prompt ending with the "next action" instruction.
    """
    # Fix: constant strings carried pointless f-prefixes (ruff F541) — removed.
    lines = [
        f"Step {step} of {max_steps}.",
        "",
        f"EXCEPTION FLAG: {obs.exception_flag.flag_code} — {obs.exception_flag.flag_description}",
        "",
        f"Available checks: {', '.join(obs.available_checks)}",
        f"Available rules: {', '.join(obs.available_rules)}",
        "",
        "Knowledge base:",
    ]
    for entry in obs.knowledge_base:
        lines.append(f" - {entry}")

    lines.append("")
    lines.append(f"Cumulative reward so far: {obs.cumulative_reward:.2f}")
    lines.append(f"Case status: {obs.case_status}")

    # Progress sections are only rendered once they have content, so the
    # first turns stay short.
    if obs.checks_run:
        lines.append(f"Checks already run: {', '.join(c.check_name for c in obs.checks_run)}")
    if obs.queries:
        lines.append(f"Queries made: {', '.join(q.target for q in obs.queries)}")
    if obs.inspections:
        lines.append(f"Fields inspected: {', '.join(f'{i.document}.{i.field}' for i in obs.inspections)}")
    if obs.rules_applied:
        lines.append(f"Rules applied: {', '.join(obs.rules_applied)}")
    if obs.decision:
        lines.append(f"Decision made: {obs.decision}")
    if obs.routed_to:
        lines.append(f"Routed to: {', '.join(obs.routed_to)}")

    if history:
        lines.append("")
        lines.append("Recent history:")
        for h in history[-5:]:
            lines.append(f" {h}")

    lines.append("")
    lines.append("What is your next action? Respond with a single JSON object.")

    return "\n".join(lines)
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # LLM caller
114
+ # ---------------------------------------------------------------------------
115
+
116
def call_llm(client: OpenAI, user_prompt: str) -> str:
    """Send the system and user prompts to the chat model; return its raw text.

    Any API failure is logged to stderr and replaced by a safe default
    action (run_check/po_match) encoded as JSON, so the episode continues.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=conversation,
            temperature=0.1,
            max_tokens=256,
        )
        return completion.choices[0].message.content or ""
    except Exception as exc:
        print(f"LLM call failed: {exc}", file=sys.stderr)
        return '{"type": "run_check", "params": {"check_name": "po_match"}}'
132
+
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Action parser
136
+ # ---------------------------------------------------------------------------
137
+
138
def parse_action(raw_text: str) -> dict:
    """
    Parse the model's response into an action dict.
    Handles markdown code fences, extra whitespace, and minor formatting errors.
    Falls back to run_check(po_match) if parsing fails.
    """
    cleaned = raw_text.strip()

    # Strip a leading ```/```json fence (and a trailing ``` if one exists).
    if cleaned.startswith("```"):
        fence_lines = cleaned.split("\n")
        if fence_lines[-1].strip() == "```":
            fence_lines = fence_lines[1:-1]
        else:
            fence_lines = fence_lines[1:]
        cleaned = "\n".join(fence_lines)

    try:
        return json.loads(cleaned.strip())
    except json.JSONDecodeError:
        pass

    # The whole text wasn't JSON — look for an embedded {...} object.
    embedded = re.search(r'\{.*\}', cleaned, re.DOTALL)
    if embedded is not None:
        try:
            return json.loads(embedded.group())
        except json.JSONDecodeError:
            pass

    # Safe fallback
    return {"type": "run_check", "params": {"check_name": "po_match"}}
166
+
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # Task runner
170
+ # ---------------------------------------------------------------------------
171
+
172
def run_task(client: OpenAI, env: InvoiceExceptionEnv, task_id: str, max_steps: int = 20) -> tuple:
    """Run one task episode and return (steps_taken, score, rewards).

    Drives the observe -> prompt -> LLM -> parse -> step loop for up to
    ``max_steps`` steps, emitting [START]/[STEP]/[END] log lines.

    Args:
        client: OpenAI-compatible client used for LLM calls.
        env: Environment instance exposing reset/step/grade.
        task_id: Task identifier passed to ``env.reset``.
        max_steps: Hard cap on the number of steps in the episode.

    Returns:
        Tuple of (steps_taken, final grade score, list of per-step rewards).
    """
    rewards = []

    print(f"[START] task={task_id} env=invoice-exception-handler model={MODEL_NAME}", flush=True)

    obs = env.reset(task_id)
    history = []
    step = 0  # ensure `step` is defined even if max_steps < 1

    for step in range(1, max_steps + 1):
        # Build prompt from observation
        user_prompt = build_prompt(obs, step, max_steps, history)

        # Call LLM
        raw = call_llm(client, user_prompt)
        action_dict = parse_action(raw)

        # Execute; a failing step is logged but does not abort the episode
        try:
            result = env.step(action_dict)
            reward = result.reward
            done = result.done
            error = None
        except Exception as e:
            reward = 0.0
            done = False
            error = str(e)
            result = None

        rewards.append(reward)
        action_str = json.dumps(action_dict)

        print(
            f"[STEP] step={step} action={action_str} "
            f"reward={reward:.2f} done={str(done).lower()} "
            f"error={error or 'null'}",
            flush=True,
        )

        history.append(f"Step {step}: {action_str} -> reward {reward:+.2f}")

        # `if result:` would mis-handle a falsy-but-valid StepResult;
        # test identity against None instead.
        if result is not None:
            obs = result.observation

        if done:
            break

    score = env.grade()["score"]
    success = score >= 0.5
    steps_taken = min(step, max_steps)
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)

    print(
        f"[END] success={str(success).lower()} steps={steps_taken} "
        f"score={score:.3f} rewards={rewards_str}",
        flush=True,
    )

    return steps_taken, score, rewards
231
+
232
+
233
+ # ---------------------------------------------------------------------------
234
+ # Main
235
+ # ---------------------------------------------------------------------------
236
+
237
def main() -> None:
    """Run inference across every task and print the mean score."""
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    env = InvoiceExceptionEnv(seed=42)

    # run_task returns (steps, score, rewards); only the score matters here.
    all_scores = [run_task(client, env, task_id)[1] for task_id in ALL_TASKS]

    avg = sum(all_scores) / len(all_scores) if all_scores else 0.0
    print(f"\nAverage score: {avg:.3f}", flush=True)


if __name__ == "__main__":
    main()
openenv.yaml ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # openenv.yaml
2
+ name: Invoice Exception Handler
3
+ version: "1.0.0"
4
+ description: |
5
+ An agent learning environment simulating accounts payable exception handling.
6
+ The agent acts as an AP analyst: investigates flagged invoices, applies business
7
+ rules, detects fraud signals, makes decisions, and closes cases with an audit trail.
8
+
9
+ authors:
10
+ - name: Mohammed Yusuf, Nadella Harshith
11
+ email: [yusufindian09@gmail.com, nadellaharshith4@gmail.com]
12
+
13
+ license: MIT
14
+
15
+ tasks:
16
+ - id: task1_price_variance
17
+ name: Price Variance Exception
18
+ difficulty: easy
19
+ description: |
20
+ Office stationery invoice arrives 3.08% above PO. Company tolerance policy
21
+ allows +/-2% auto-approval. Agent must detect the variance, verify through
22
+ the tolerance rule, confirm verbal approval with procurement, and approve
23
+ with a PO amendment request.
24
+ max_steps: 18
25
+ optimal_score: 1.0
26
+ min_passing_score: 0.60
27
+
28
+ - id: task2_duplicate_tax
29
+ name: Duplicate Invoice with Tax Error
30
+ difficulty: medium
31
+ description: |
32
+ Logistics supplier submits INV-2024-891, a duplicate of paid INV-2024-819
33
+ (digit transposition: 891 vs 819). Original invoice had wrong GST rate (15%
34
+ vs correct 18%) — company overpaid 3,240 INR. New invoice has correct rate.
35
+ Agent must detect the duplicate, identify the tax error in the original,
36
+ and partially approve only the 3,240 INR tax correction.
37
+ max_steps: 20
38
+ optimal_score: 1.0
39
+ min_passing_score: 0.50
40
+
41
+ - id: task3_compound_fraud
42
+ name: Compound Fraud Signals
43
+ difficulty: hard
44
+ description: |
45
+ IT equipment supplier invoice with four simultaneous fraud signals: bank
46
+ account changed via BEC attack (lookalike email domain), GSTIN belongs to
47
+ a different entity, 2 of 15 laptops not yet received, and unit price 8.65%
48
+ above PO. Agent must find all signals, use the correct communication channel
49
+ (phone, not email — which would contact the fraudster), and escalate to legal
50
+ and security.
51
+ max_steps: 25
52
+ optimal_score: 1.0
53
+ min_passing_score: 0.40
54
+
55
+ observation_space:
56
+ type: object
57
+ description: EnvironmentState Pydantic model
58
+ fields:
59
+ task_id: {type: string}
60
+ step_number: {type: integer}
61
+ case_status: {type: string, enum: [open, in_review, decided, routed, closed]}
62
+ purchase_order: {type: object, description: "PO with line items and terms"}
63
+ invoice: {type: object, description: "Supplier invoice with line items and tax"}
64
+ grn: {type: object, description: "Goods receipt — what actually arrived"}
65
+ supplier_master: {type: object, description: "Verified supplier record"}
66
+ exception_flag: {type: object, description: "Why the system flagged this invoice"}
67
+ inspections: {type: array, description: "Fields the agent has inspected"}
68
+ checks_run: {type: array, description: "Validation checks completed"}
69
+ queries: {type: array, description: "Internal and supplier queries"}
70
+ rules_applied: {type: array, description: "Business rules applied"}
71
+ decision: {type: string, nullable: true}
72
+ routed_to: {type: array}
73
+ available_actions: {type: array}
74
+ available_checks: {type: array}
75
+ available_rules: {type: array}
76
+ knowledge_base: {type: array}
77
+ cumulative_reward: {type: number}
78
+
79
+ action_space:
80
+ type: object
81
+ description: Action with type and params
82
+ actions:
83
+ inspect_field:
84
+ params: {document: string, field: string}
85
+ cross_check:
86
+ params: {field: string, doc_a: string, doc_b: string}
87
+ run_check:
88
+ params: {check_name: string}
89
+ query_supplier:
90
+ params: {question: string, channel: string}
91
+ query_internal:
92
+ params: {department: string, question: string}
93
+ apply_rule:
94
+ params: {rule_id: string}
95
+ make_decision:
96
+ params: {decision: string, reason: string}
97
+ route_to:
98
+ params: {team: string, notes: string}
99
+ close_case:
100
+ params: {summary: string}
101
+
102
+ reward:
103
+ range: [-1.0, 1.0]
104
+ description: |
105
+ Shaped reward at every step. Relevant inspections: +0.01 to +0.14.
106
+ Diagnostics revealing issues: +0.08 to +0.18. Correct fixes: +0.08 to +0.30.
107
+ Wrong decision on fraud: -0.15 to -0.40. Repeat actions: -0.02 to -0.05.
108
+ SLA breach: -0.10.
109
+
110
+ grading:
111
+ method: task_grader
112
+ scores:
113
+ - score
114
+ - diagnosis_score
115
+ - investigation_score
116
+ - decision_score
117
+ - routing_score
118
+ - closure_score
119
+ - efficiency_score
120
+
121
+ api:
122
+ reset:
123
+ signature: "reset(task_id: str | None = None) -> EnvironmentState"
124
+ step:
125
+ signature: "step(action: Action | dict) -> StepResult"
126
+ state:
127
+ signature: "state() -> EnvironmentState"
128
+ grade:
129
+ signature: "grade() -> Dict[str, float]"
130
+
131
+ http_endpoints:
132
+ - path: /reset
133
+ method: POST
134
+ description: Reset environment, returns EnvironmentState JSON
135
+ - path: /step
136
+ method: POST
137
+ description: Execute action, returns StepResult JSON
138
+ - path: /state
139
+ method: GET
140
+ description: Current state, returns EnvironmentState JSON
141
+ - path: /grade
142
+ method: POST
143
+ description: Grade current episode
144
+ - path: /health
145
+ method: GET
146
+ description: Health check
147
+
148
+ dependencies:
149
+ python: ">=3.10"
150
+ packages:
151
+ - pydantic>=2.7
152
+ - fastapi>=0.111
153
+ - uvicorn>=0.29
154
+ - gradio>=4.36
155
+ - openai>=1.35
156
+ - pyyaml>=6.0
157
+
158
+ docker:
159
+ port: 7860
160
+ health_check: /health
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ pydantic==2.11.1
2
+ fastapi==0.115.12
3
+ uvicorn==0.34.2
4
+ gradio==5.23.3
5
+ openai==1.75.0
6
+ pyyaml==6.0.2
7
+ httpx==0.28.1
8
+ python-multipart==0.0.20
9
+ openenv-core==0.1.0