Spaces:

ehsaaniqbal
/

invoiceops-env

Sleeping

+# syntax=docker/dockerfile:1
+# Two-stage image for the InvoiceOps OpenEnv server.
+# Stage "builder" installs uv (if the base image lacks it) and resolves the
+# project's virtualenv; the final stage copies the venv and sources only.
+# NOTE(review): the default base is tagged :latest; pass --build-arg BASE_IMAGE=<pinned tag or digest> for reproducible builds.
+ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+FROM ${BASE_IMAGE} AS builder
+WORKDIR /app
+# git is installed for the dependency sync below — presumably for VCS-sourced dependencies; TODO confirm.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
+    rm -rf /var/lib/apt/lists/*
+ARG BUILD_MODE=in-repo
+ARG ENV_NAME=invoiceops_env
+COPY . /app/env
+WORKDIR /app/env
+# Download the installer to a file before running it: with `curl ... | sh`
+# under /bin/sh the pipeline's status is that of `sh`, so a failed or
+# truncated download would not fail the build.
+RUN if ! command -v uv >/dev/null 2>&1; then \
+        curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh && \
+        sh /tmp/uv-install.sh && \
+        rm -f /tmp/uv-install.sh && \
+        mv /root/.local/bin/uv /usr/local/bin/uv && \
+        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
+    fi
+# First sync: third-party dependencies only (--no-install-project).
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-install-project --no-editable; \
+    else \
+        uv sync --no-install-project --no-editable; \
+    fi
+# Second sync: install the project itself into the same virtualenv.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-editable; \
+    else \
+        uv sync --no-editable; \
+    fi
+FROM ${BASE_IMAGE}
+WORKDIR /app
+COPY --from=builder /app/env/.venv /app/.venv
+# NOTE(review): this copy also carries /app/env/.venv, so the venv lands in the image twice.
+COPY --from=builder /app/env /app/env
+ENV PATH="/app/.venv/bin:$PATH"
+# Append the inherited PYTHONPATH only when it is set, so no trailing ':' (an
+# implicit current-directory entry) is added when the base image leaves it unset.
+ENV PYTHONPATH="/app/env${PYTHONPATH:+:${PYTHONPATH}}"
+# Documentation only: the server listens on 8000 (see CMD and HEALTHCHECK).
+EXPOSE 8000
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+# `exec` replaces the wrapper shell so uvicorn is PID 1 and receives SIGTERM from `docker stop`.
+CMD ["sh", "-c", "cd /app/env && exec uvicorn server.app:app --host 0.0.0.0 --port 8000"]

README.md ADDED Viewed

	@@ -0,0 +1,185 @@

+---
+title: InvoiceOps Environment Server
+emoji: 📄
+colorFrom: yellow
+colorTo: gray
+sdk: docker
+pinned: false
+app_port: 8000
+base_path: /web
+tags:
+  - openenv
+  - finance
+  - accounts-payable
+  - invoices
+---
+# InvoiceOps Environment
+Submitted by team: `Markov`
+`InvoiceOps` is a deterministic OpenEnv environment for [accounts payable (AP)](https://en.wikipedia.org/wiki/Accounts_payable) invoice exception handling. Each episode is one invoice case. The agent inspects surfaced exceptions, opens typed supporting artifacts, optionally runs duplicate checks, writes structured notes, saves line and header resolutions, and submits the case for deterministic grading.
+In real AP operations, this is the core decision problem: determine whether an invoice can be paid now, partially released, or routed for further review based on invoices, POs, receipts, approval status, and policy evidence.
+The workflow is loosely modeled on real enterprise AP controls used in systems such as [Microsoft Dynamics 365 Accounts payable](https://learn.microsoft.com/en-us/dynamics365/finance/accounts-payable/accounts-payable), including invoice review and approval, invoice matching, workflow routing, and partial payment handling.
+This environment is intentionally small and CPU-friendly, but it still measures real AP judgment:
+- evidence gathering before payment decisions
+- line-level vs header-level separation
+- duplicate-review strategy selection
+- receipt support judgment
+- partial release vs full hold
+- chronology-aware exception handling
+- routing to the correct follow-up owner when payment is not safe
+## Public Benchmark
+The public benchmark has four tasks. `easy` is a warm-up. `medium` and `medium_plus` test distinct mid-tier capabilities. `hard` is the composition case.
+| Task          | Core burden                                                                       | Best outcome                                           |
+| ------------- | --------------------------------------------------------------------------------- | ------------------------------------------------------ |
+| `easy`        | Start a missing approval workflow for a non-PO invoice                            | Hold and route to `requester`                          |
+| `medium`      | Clear a duplicate exception using the correct evidence path                       | Approve both lines and release payment                 |
+| `medium_plus` | Combine duplicate clearance with mixed line outcomes                              | Approve `L1`, hold `L2`, release approved lines        |
+| `hard`        | Combine duplicate review, invoice arithmetic, receipt chronology, and a tax block | Approve `L1` and `L3`, hold `L2`, hold header to `tax` |
+### Task Details
+#### `easy`
+Non-PO invoice with no initiated approval workflow. The invoice amount is within requester authority, so the correct action is to hold and route to `requester`.
+#### `medium`
+PO-backed invoice with a possible duplicate flag. The decisive evidence appears only after the normalized invoice number duplicate search. Approving safely requires the right duplicate path plus PO and receipt review.
+#### `medium_plus`
+PO-backed invoice with a possible duplicate flag and one short-received line above the de minimis threshold. The agent must clear the duplicate, separate line outcomes correctly, and use `release_approved_lines` instead of a blanket hold.
+#### `hard`
+Project invoice with interacting burdens: duplicate review, de minimis invoice arithmetic on `L1`, chronology-sensitive receipt support on `L2`, and a tax header block that routes to `tax`.
+## Action Space
+`InvoiceOpsAction` is a typed action model with these actions:
+- `open_artifact`
+- `inspect_exception`
+- `run_duplicate_check`
+- `add_note`
+- `set_line_resolution`
+- `set_header_resolution`
+- `submit_case`
+## Observation Space
+`InvoiceOpsObservation` includes:
+- queue-level case summary
+- available artifacts
+- most recently opened artifact
+- exception stubs and inspected exception details
+- duplicate candidates surfaced by the chosen strategy
+- saved notes
+- draft line and header resolutions
+- progress counters
+- final deterministic submission report after submit
+## Scoring
+The reward function provides dense trajectory signal for useful work such as first-time artifact opens, exception inspection, duplicate checks, notes, and valid saved resolutions. It penalizes invalid or redundant actions and inefficient trajectories.
+Final grading is deterministic and two-stage:
+1. Assign a `decision_band`: `best`, `safe_suboptimal`, `wrong`, or `unsafe`.
+2. Score within that band using core decision quality, timely evidence, structured documentation coverage, and efficiency.
+Important grading rule: best outcomes require the agent to uncover the required evidence before saving the decision. Conservative holds can still earn `safe_suboptimal` when the observed evidence justifies caution.
+## Design Choices
+This benchmark was iterated on, not created in one pass. We tried weaker task and grader shapes first, then removed designs that were easy to game or that clustered strong models for the wrong reasons.
+Key anti-gaming choices:
+- no pre-opened artifacts, auto-inspected exceptions, or auto-run duplicate checks
+- no hidden scenario-specific solver logic in the environment or grader
+- no prose grading; scores depend on typed actions, saved resolutions, observed evidence, and timing
+- fallback runs are zeroed in baseline mean scoring
+- conservative blanket holds are capped in `safe_suboptimal`; they do not earn `best`
+Main lessons from iteration:
+- making partial credit harsher did not improve the benchmark; harder tasks had to require better evidence use and better judgment
+- gating on restated citation strings was too brittle; grading now depends on evidence actually uncovered before the decision was saved
+## Local Setup
+```bash
+cd invoiceops_env
+uv sync --extra dev
+uv run pytest -q
+uv run server --port 8000
+```
+Run validation from the environment root:
+```bash
+openenv validate .
+openenv validate --url http://localhost:8000
+```
+If `openenv` is not installed in the current environment:
+```bash
+uvx --from openenv-core openenv validate .
+uvx --from openenv-core openenv validate --url http://localhost:8000
+```
+## Baseline
+The root [inference.py](./inference.py) script is the reproducible baseline.
+- OpenAI Python client
+- default `API_BASE_URL`: `https://router.huggingface.co/v1`
+- default `MODEL_NAME`: `zai-org/GLM-5.1`
+- fallback tasks are zeroed in `mean_score` by default while raw environment scores are still preserved
+- run artifacts are written under `outputs/evals/`
+Verified baseline on the current public benchmark:
+- model: `zai-org/GLM-5.1`
+- mean score: `0.6149`
+- task scores: `easy 0.9862`, `medium 0.9628`, `medium_plus 0.3130`, `hard 0.1975`
+Run it with:
+```bash
+cd invoiceops_env
+HF_TOKEN=... \
+API_BASE_URL=https://router.huggingface.co/v1 \
+uv run python inference.py
+```
+Optional environment variables:
+- `HF_TOKEN`
+- `API_BASE_URL`
+- `MODEL_NAME`
+- `ENV_URL`
+- `EVAL_RUN_NAME`
+- `MAX_TOKENS`
+- `RETRY_MAX_TOKENS`
+- `STRICT_BASELINE_SCORING`
+## Docker
+```bash
+cd invoiceops_env
+docker build -t invoiceops-env:latest .
+docker run -p 8000:8000 invoiceops-env:latest
+```

__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+"""InvoiceOps environment package."""
+from invoiceops_env.client import InvoiceOpsEnv
+from invoiceops_env.models import (
+    InvoiceOpsAction,
+    InvoiceOpsObservation,
+    InvoiceOpsState,
+    TaskId,
+)
+__all__ = [
+    "InvoiceOpsAction",
+    "InvoiceOpsEnv",
+    "InvoiceOpsObservation",
+    "InvoiceOpsState",
+    "TaskId",
+]

batch ADDED Viewed

	@@ -0,0 +1,536 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import concurrent.futures
+import csv
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+from urllib.error import URLError
+from urllib.request import urlopen
+ROOT = Path(__file__).resolve().parent
+DEFAULT_API_BASE_URL = "https://router.huggingface.co/v1"
+DEFAULT_MODELS = [
+    "zai-org/GLM-5.1",
+    "openai/gpt-oss-120b",
+    "MiniMaxAI/MiniMax-M2.5",
+    "moonshotai/Kimi-K2.5",
+    # "google/gemma-4-31B-it",
+]
+TASK_COLUMNS = ["easy", "medium", "medium_plus", "hard"]
+@dataclass
+class ServerHandle:
+    port: int
+    process: subprocess.Popen[str] | None
+    log_path: Path
+    log_handle: Any | None
+    reused: bool = False
+    @property
+    def base_url(self) -> str:
+        return f"http://localhost:{self.port}"
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Run a batch of HF models against the local InvoiceOps environment."
+    )
+    parser.add_argument("--models", nargs="+", help="Override the default model list.")
+    parser.add_argument(
+        "--models-file",
+        help="Optional text file with one HF model id per line.",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=8000,
+        help="Port for the local InvoiceOps server.",
+    )
+    parser.add_argument(
+        "--sync",
+        action="store_true",
+        help="Run `uv sync --extra dev` before starting.",
+    )
+    parser.add_argument(
+        "--validate",
+        action="store_true",
+        help="Run `openenv validate --url` before the sweep.",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Echo inference stderr while runs complete.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print the planned configuration without starting servers or calling models.",
+    )
+    parser.add_argument(
+        "--jobs",
+        type=int,
+        default=1,
+        help="Number of concurrent model runs.",
+    )
+    parser.add_argument(
+        "--reuse-running-server",
+        action="store_true",
+        help="Reuse an already-running server on the target port instead of failing fast.",
+    )
+    return parser.parse_args()
+def slugify(value: str) -> str:
+    slug = re.sub(r"[^A-Za-z0-9._-]+", "-", value.strip())
+    slug = slug.strip("-._")
+    return slug or "value"
+def load_models(args: argparse.Namespace) -> list[str]:
+    if args.models:
+        return args.models
+    if args.models_file:
+        path = Path(args.models_file).expanduser().resolve()
+        models = [
+            line.strip()
+            for line in path.read_text(encoding="utf-8").splitlines()
+            if line.strip() and not line.strip().startswith("#")
+        ]
+        if not models:
+            raise RuntimeError(f"No models found in {path}")
+        return models
+    return DEFAULT_MODELS
+def is_healthy(base_url: str, timeout_s: float = 1.0) -> bool:
+    try:
+        with urlopen(f"{base_url}/health", timeout=timeout_s) as response:
+            return response.status == 200
+    except URLError:
+        return False
+    except Exception:
+        return False
+def wait_for_health(base_url: str, timeout_s: float = 20.0) -> bool:
+    start = time.time()
+    while time.time() - start < timeout_s:
+        if is_healthy(base_url, timeout_s=1.0):
+            return True
+        time.sleep(0.5)
+    return False
+def start_server(
+    port: int,
+    batch_dir: Path,
+    *,
+    reuse_running_server: bool,
+) -> ServerHandle:
+    base_url = f"http://localhost:{port}"
+    if is_healthy(base_url):
+        if not reuse_running_server:
+            raise RuntimeError(
+                "A healthy server is already running at "
+                f"{base_url}. Stop it first or rerun with --reuse-running-server."
+            )
+        print(f"[batch] reusing running invoiceops_env at {base_url}", file=sys.stderr)
+        return ServerHandle(
+            port=port,
+            process=None,
+            log_path=batch_dir / "logs" / "invoiceops_env__server.log",
+            log_handle=None,
+            reused=True,
+        )
+    log_path = batch_dir / "logs" / "invoiceops_env__server.log"
+    log_handle = log_path.open("w", encoding="utf-8")
+    process = subprocess.Popen(
+        ["uv", "run", "server", "--port", str(port)],
+        cwd=ROOT,
+        stdout=log_handle,
+        stderr=subprocess.STDOUT,
+        text=True,
+    )
+    if not wait_for_health(base_url):
+        process.terminate()
+        try:
+            process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            process.kill()
+        log_handle.close()
+        tail = log_path.read_text(encoding="utf-8", errors="replace")[-4000:]
+        raise RuntimeError(f"Failed to start invoiceops_env.\n{tail}")
+    print(f"[batch] started invoiceops_env at {base_url}", file=sys.stderr)
+    return ServerHandle(
+        port=port,
+        process=process,
+        log_path=log_path,
+        log_handle=log_handle,
+        reused=False,
+    )
+def stop_server(handle: ServerHandle) -> None:
+    if handle.process is not None:
+        handle.process.terminate()
+        try:
+            handle.process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            handle.process.kill()
+            handle.process.wait(timeout=5)
+    if handle.log_handle is not None:
+        handle.log_handle.close()
+def validate_server(handle: ServerHandle) -> None:
+    subprocess.run(
+        [
+            "uvx",
+            "--from",
+            "openenv-core",
+            "openenv",
+            "validate",
+            "--url",
+            handle.base_url,
+        ],
+        cwd=ROOT,
+        check=True,
+    )
+def parse_output_path(stderr_text: str) -> Path | None:
+    for line in reversed(stderr_text.splitlines()):
+        if line.startswith("wrote="):
+            return Path(line.split("=", 1)[1].strip())
+    return None
+def make_batch_dir() -> Path:
+    batch_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    batch_dir = ROOT / "batch_runs" / batch_id
+    (batch_dir / "logs").mkdir(parents=True, exist_ok=True)
+    return batch_dir
+def _collect_request_errors(node: Any) -> list[str]:
+    errors: list[str] = []
+    if isinstance(node, dict):
+        if node.get("failure_reason") == "request_error":
+            message = node.get("error_message")
+            if isinstance(message, str) and message.strip():
+                errors.append(message.strip())
+        for value in node.values():
+            errors.extend(_collect_request_errors(value))
+    elif isinstance(node, list):
+        for value in node:
+            errors.extend(_collect_request_errors(value))
+    return errors
+def classify_status(
+    *,
+    returncode: int,
+    payload: dict[str, Any] | None,
+    request_errors: list[str],
+) -> str:
+    if returncode != 0 or payload is None:
+        return "failed"
+    if not request_errors:
+        return "ok"
+    joined = "\n".join(request_errors).lower()
+    if "model_not_supported" in joined or "not a chat model" in joined:
+        return "invalid_model"
+    if "depleted your monthly included credits" in joined:
+        return "provider_credit_error"
+    return "request_error"
+def extract_scores(payload: dict[str, Any]) -> tuple[dict[str, float], int, int]:
+    results = payload.get("results") or []
+    scores: dict[str, float] = {}
+    fallback_count = 0
+    parse_failure_count = 0
+    for result in results:
+        task_id = result.get("task_id")
+        score = result.get("score")
+        if isinstance(task_id, str) and isinstance(score, (int, float)):
+            scores[task_id] = float(score)
+        if result.get("used_fallback") is True:
+            fallback_count += 1
+        if result.get("decision_parsed") is False:
+            parse_failure_count += 1
+    return scores, fallback_count, parse_failure_count
+def run_inference(
+    handle: ServerHandle,
+    *,
+    model_name: str,
+    hf_token: str,
+    api_base_url: str,
+    batch_name: str,
+    logs_dir: Path,
+    verbose: bool,
+) -> dict[str, Any]:
+    model_slug = slugify(model_name)
+    stdout_path = logs_dir / f"invoiceops_env__{model_slug}.stdout.log"
+    stderr_path = logs_dir / f"invoiceops_env__{model_slug}.stderr.log"
+    env = os.environ.copy()
+    env.update(
+        {
+            "HF_TOKEN": hf_token,
+            "API_BASE_URL": api_base_url,
+            "MODEL_NAME": model_name,
+            "ENV_URL": handle.base_url,
+            "EVAL_RUN_NAME": batch_name,
+        }
+    )
+    started_at = time.time()
+    result = subprocess.run(
+        ["uv", "run", "python", "inference.py"],
+        cwd=ROOT,
+        env=env,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    duration_s = round(time.time() - started_at, 2)
+    stdout_path.write_text(result.stdout, encoding="utf-8")
+    stderr_path.write_text(result.stderr, encoding="utf-8")
+    if verbose and result.stderr.strip():
+        sys.stderr.write(result.stderr)
+        if not result.stderr.endswith("\n"):
+            sys.stderr.write("\n")
+    output_path = parse_output_path(result.stderr)
+    payload: dict[str, Any] | None = None
+    if output_path is not None and output_path.exists():
+        payload = json.loads(output_path.read_text(encoding="utf-8"))
+    scores: dict[str, float] = {}
+    fallback_count = 0
+    parse_failure_count = 0
+    mean_score = None
+    request_errors: list[str] = []
+    if payload is not None:
+        if isinstance(payload.get("mean_score"), (int, float)):
+            mean_score = float(payload["mean_score"])
+        elif isinstance(payload.get("raw_mean_score"), (int, float)):
+            mean_score = float(payload["raw_mean_score"])
+        scores, fallback_count, parse_failure_count = extract_scores(payload)
+        request_errors = _collect_request_errors(payload)
+    status = classify_status(
+        returncode=result.returncode,
+        payload=payload,
+        request_errors=request_errors,
+    )
+    return {
+        "model": model_name,
+        "status": status,
+        "returncode": result.returncode,
+        "duration_s": duration_s,
+        "mean_score": mean_score,
+        "fallback_count": fallback_count,
+        "parse_failure_count": parse_failure_count,
+        "request_error_count": len(request_errors),
+        "first_request_error": request_errors[0] if request_errors else "",
+        "output_json": str(output_path) if output_path is not None else "",
+        "stdout_log": str(stdout_path),
+        "stderr_log": str(stderr_path),
+        **{task_id: scores.get(task_id) for task_id in TASK_COLUMNS},
+    }
+def print_summary(rows: list[dict[str, Any]]) -> None:
+    headers = [
+        "model",
+        "mean",
+        *TASK_COLUMNS,
+        "fallbacks",
+        "parse_fail",
+        "req_err",
+        "status",
+        "sec",
+    ]
+    widths = {header: len(header) for header in headers}
+    rendered_rows: list[dict[str, str]] = []
+    for row in rows:
+        rendered = {
+            "model": row["model"],
+            "mean": "-" if row["mean_score"] is None else f"{row['mean_score']:.4f}",
+            "fallbacks": str(row["fallback_count"]),
+            "parse_fail": str(row["parse_failure_count"]),
+            "req_err": str(row["request_error_count"]),
+            "status": row["status"],
+            "sec": f"{row['duration_s']:.1f}",
+        }
+        rendered.update(
+            {
+                task_id: "-" if row.get(task_id) is None else f"{row[task_id]:.4f}"
+                for task_id in TASK_COLUMNS
+            }
+        )
+        rendered_rows.append(rendered)
+        for key, value in rendered.items():
+            widths[key] = max(widths[key], len(value))
+    print("  ".join(header.ljust(widths[header]) for header in headers))
+    print("  ".join("-" * widths[header] for header in headers))
+    for row in rendered_rows:
+        print("  ".join(row[header].ljust(widths[header]) for header in headers))
+def write_summary_files(
+    batch_dir: Path, rows: list[dict[str, Any]]
+) -> tuple[Path, Path]:
+    csv_path = batch_dir / "summary.csv"
+    json_path = batch_dir / "summary.json"
+    fieldnames = [
+        "model",
+        "mean_score",
+        *TASK_COLUMNS,
+        "fallback_count",
+        "parse_failure_count",
+        "request_error_count",
+        "status",
+        "duration_s",
+        "returncode",
+        "first_request_error",
+        "output_json",
+        "stdout_log",
+        "stderr_log",
+    ]
+    with csv_path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames if rows else ["model"])
+        writer.writeheader()
+        writer.writerows(rows)
+    json_path.write_text(json.dumps(rows, indent=2), encoding="utf-8")
+    return csv_path, json_path
+def main() -> int:
+    args = parse_args()
+    if args.jobs < 1:
+        raise RuntimeError("--jobs must be at least 1.")
+    models = load_models(args)
+    api_base_url = os.getenv("API_BASE_URL", DEFAULT_API_BASE_URL)
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token and not args.dry_run:
+        raise RuntimeError("Set HF_TOKEN in the shell before running ./batch.")
+    if args.dry_run:
+        print("Dry run only.")
+        print(f"API_BASE_URL={api_base_url}")
+        print(f"models={','.join(models)}")
+        print(f"jobs={args.jobs}")
+        print(f"invoiceops_env -> http://localhost:{args.port}")
+        return 0
+    batch_dir = make_batch_dir()
+    batch_name = batch_dir.name
+    logs_dir = batch_dir / "logs"
+    rows: list[dict[str, Any]] = []
+    handle: ServerHandle | None = None
+    try:
+        if args.sync:
+            subprocess.run(["uv", "sync", "--extra", "dev"], cwd=ROOT, check=True)
+        handle = start_server(
+            args.port,
+            batch_dir,
+            reuse_running_server=args.reuse_running_server,
+        )
+        if args.validate:
+            validate_server(handle)
+        print(f"[batch] batch={batch_name}", file=sys.stderr)
+        print(f"[batch] api_base_url={api_base_url}", file=sys.stderr)
+        if args.jobs == 1:
+            for model_name in models:
+                print(
+                    f"[batch] running invoiceops_env :: {model_name}", file=sys.stderr
+                )
+                row = run_inference(
+                    handle,
+                    model_name=model_name,
+                    hf_token=hf_token,
+                    api_base_url=api_base_url,
+                    batch_name=batch_name,
+                    logs_dir=logs_dir,
+                    verbose=args.verbose,
+                )
+                rows.append(row)
+                mean_display = (
+                    "-" if row["mean_score"] is None else f"{row['mean_score']:.4f}"
+                )
+                print(
+                    f"[batch] result invoiceops_env :: {model_name} mean={mean_display} status={row['status']}",
+                    file=sys.stderr,
+                )
+        else:
+            with concurrent.futures.ThreadPoolExecutor(
+                max_workers=args.jobs
+            ) as executor:
+                futures = {
+                    executor.submit(
+                        run_inference,
+                        handle,
+                        model_name=model_name,
+                        hf_token=hf_token,
+                        api_base_url=api_base_url,
+                        batch_name=batch_name,
+                        logs_dir=logs_dir,
+                        verbose=args.verbose,
+                    ): model_name
+                    for model_name in models
+                }
+                for future in concurrent.futures.as_completed(futures):
+                    model_name = futures[future]
+                    row = future.result()
+                    rows.append(row)
+                    mean_display = (
+                        "-" if row["mean_score"] is None else f"{row['mean_score']:.4f}"
+                    )
+                    print(
+                        f"[batch] result invoiceops_env :: {model_name} mean={mean_display} status={row['status']}",
+                        file=sys.stderr,
+                    )
+            order = {model: index for index, model in enumerate(models)}
+            rows.sort(key=lambda row: order[row["model"]])
+        csv_path, json_path = write_summary_files(batch_dir, rows)
+        print_summary(rows)
+        print(f"\nsummary_csv={csv_path}")
+        print(f"summary_json={json_path}")
+        print(f"logs_dir={logs_dir}")
+        return 0
+    finally:
+        if handle is not None:
+            stop_server(handle)
+if __name__ == "__main__":
+    raise SystemExit(main())

client.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""Client for the InvoiceOps environment."""
+from __future__ import annotations
+from typing import Any
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult
+from invoiceops_env.models import (
+    InvoiceOpsAction,
+    InvoiceOpsObservation,
+    InvoiceOpsState,
+)
+class InvoiceOpsEnv(EnvClient[InvoiceOpsAction, InvoiceOpsObservation, InvoiceOpsState]):
+    """WebSocket client for persistent InvoiceOps sessions."""
+    def _step_payload(self, action: InvoiceOpsAction) -> dict[str, Any]:
+        return action.model_dump(exclude_none=True)
+    def _parse_result(self, payload: dict[str, Any]) -> StepResult[InvoiceOpsObservation]:
+        obs_data = payload.get("observation", {})
+        observation = InvoiceOpsObservation.model_validate(
+            {
+                **obs_data,
+                "done": payload.get("done", False),
+                "reward": payload.get("reward"),
+            }
+        )
+        return StepResult(
+            observation=observation,
+            reward=payload.get("reward"),
+            done=payload.get("done", False),
+        )
+    def _parse_state(self, payload: dict[str, Any]) -> InvoiceOpsState:
+        return InvoiceOpsState.model_validate(payload)

data/scenarios/easy.json ADDED Viewed

	@@ -0,0 +1,167 @@

+{
+  "scenario_id": "easy",
+  "task_id": "easy",
+  "case_id": "CASE-EASY-001",
+  "title": "Non-PO invoice with unstarted approval workflow",
+  "description": "Review a non-PO services invoice with an open approval control. Determine the correct requester-tier routing and payment recommendation from the workflow state and policy.",
+  "step_limit": 10,
+  "queue_card": {
+    "case_id": "CASE-EASY-001",
+    "vendor_name": "Orion Advisory Partners",
+    "vendor_id": "V-741",
+    "invoice_number": "OA-4401",
+    "invoice_date": "2026-03-20",
+    "invoice_total": 8500.0,
+    "currency": "USD",
+    "po_number": null,
+    "risk_flags": ["non_po_invoice", "missing_approval"],
+    "summary": "Non-PO services invoice with an open approval control."
+  },
+  "artifacts": [
+    {
+      "artifact_id": "art-invoice",
+      "artifact_type": "invoice_packet",
+      "title": "Invoice packet OA-4401",
+      "summary": "Single-line non-PO advisory invoice.",
+      "fields": [
+        {"label": "Vendor", "value": "Orion Advisory Partners"},
+        {"label": "Invoice number", "value": "OA-4401"},
+        {"label": "Invoice date", "value": "2026-03-20"},
+        {"label": "Invoice type", "value": "Non-PO services"},
+        {"label": "Cost center", "value": "EXEC-110"},
+        {"label": "Gross total", "value": "8500.00 USD"}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Q1 strategic advisory engagement package",
+          "quantity": 1.0,
+          "unit_price": 8500.0,
+          "amount": 8500.0,
+          "status": "invoiced",
+          "notes": "Cost center EXEC-110"
+        }
+      ],
+      "events": [
+        {
+          "event_id": "evt-received",
+          "event_type": "invoice_received",
+          "event_date": "2026-03-21",
+          "description": "Invoice packet received through AP inbox",
+          "quantity": null,
+          "amount": 8500.0,
+          "status": "queued"
+        }
+      ],
+      "related_refs": ["art-approval", "art-policy", "EX-NONPO-APPROVAL"]
+    },
+    {
+      "artifact_id": "art-approval",
+      "artifact_type": "approval_artifact",
+      "title": "Approval trail for OA-4401",
+      "summary": "Approval workflow has not been started.",
+      "fields": [
+        {"label": "Workflow status", "value": "Not initiated"},
+        {"label": "Requester", "value": "Jordan Kim"},
+        {"label": "Requester authority", "value": "Up to 10000.00 USD"},
+        {"label": "Submitted for approval", "value": "No"}
+      ],
+      "line_items": [],
+      "events": [],
+      "related_refs": ["art-policy", "EX-NONPO-APPROVAL"]
+    },
+    {
+      "artifact_id": "art-vendor",
+      "artifact_type": "vendor_master",
+      "title": "Vendor master: Orion Advisory Partners",
+      "summary": "Active professional-services vendor.",
+      "fields": [
+        {"label": "Vendor ID", "value": "V-741"},
+        {"label": "Payment terms", "value": "Net 30"},
+        {"label": "Vendor status", "value": "Active"},
+        {"label": "Blanket PO authorization", "value": "None"}
+      ],
+      "line_items": [],
+      "events": [],
+      "related_refs": ["art-invoice"]
+    },
+    {
+      "artifact_id": "art-policy",
+      "artifact_type": "policy_card",
+      "title": "AP policy card",
+      "summary": "Non-PO authorization and routing thresholds.",
+      "fields": [
+        {"label": "Non-PO authorization rule", "value": "Non-PO invoices require completed authorization before any payment release."},
+        {"label": "Requester authority limit", "value": "Business requester may authorize non-PO spend up to 10000.00 USD."},
+        {"label": "AP Manager authority", "value": "Amounts above requester authority require AP Manager authorization before release."},
+        {"label": "Unstarted workflow handling", "value": "If no approval workflow exists, place the invoice on hold and route it to the requester to initiate approval when the amount is within requester authority."}
+      ],
+      "line_items": [],
+      "events": [],
+      "related_refs": ["EX-NONPO-APPROVAL"]
+    }
+  ],
+  "exceptions": [
+    {
+      "exception_id": "EX-NONPO-APPROVAL",
+      "exception_type": "non_po_missing_approval",
+      "severity": "high",
+      "headline": "Authorization control is open for this non-PO invoice",
+      "impacted_line_ids": ["L1"],
+      "short_description": "The invoice has not completed required authorization.",
+      "fields": [
+        {"label": "Workflow status", "value": "Not initiated"},
+        {"label": "Invoice total", "value": "8500.00 USD"},
+        {"label": "Authorization status", "value": "Incomplete"}
+      ],
+      "reviewer_guidance": "Review the approval trail and policy before deciding."
+    }
+  ],
+  "duplicate_candidates": [],
+  "hidden_truth": {
+    "line_expectations": {
+      "L1": {
+        "amount": 8500.0,
+        "score_map": {
+          "hold": 1.0,
+          "escalate": 0.65,
+          "reject": 0.15,
+          "approve": 0.0
+        },
+        "accepted_reason_sets": [
+          ["non_po_approval_missing"]
+        ],
+        "accepted_routes": ["requester"],
+        "gating_refs": [],
+        "decisive_refs": ["art-invoice", "art-approval", "art-policy", "EX-NONPO-APPROVAL"],
+        "unsafe_approve": true
+      }
+    },
+    "header_expectation": {
+      "score_map": {
+        "hold_full_invoice": 1.0,
+        "escalate_case": 0.75,
+        "reject_full_invoice": 0.15,
+        "release_approved_lines": 0.0
+      },
+      "accepted_reason_sets": [
+        ["non_po_approval_missing"]
+      ],
+      "accepted_routes": ["requester"],
+      "gating_refs": [],
+      "decisive_refs": ["art-approval", "art-policy", "EX-NONPO-APPROVAL"],
+      "unsafe_recommendations": ["release_approved_lines"],
+      "overconservative_recommendations": []
+    },
+    "note_expectations": [
+      {
+        "issue_id": "non_po_workflow_missing",
+        "accepted_reason_sets": [
+          ["non_po_approval_missing"]
+        ],
+        "decisive_refs": ["art-approval", "art-policy"]
+      }
+    ],
+    "efficient_step_target": 8
+  }
+}

data/scenarios/hard.json ADDED Viewed

	@@ -0,0 +1,537 @@

+{
+  "scenario_id": "hard",
+  "task_id": "hard",
+  "case_id": "CASE-HARD-001",
+  "title": "Project invoice with mixed support, duplicate review, and tax block",
+  "description": "Review a project equipment invoice with duplicate, receiving, and tax controls. Resolve each line and the case header from the evidence you gather.",
+  "step_limit": 24,
+  "queue_card": {
+    "case_id": "CASE-HARD-001",
+    "vendor_name": "Northshore Controls",
+    "vendor_id": "V-229",
+    "invoice_number": "NC-8831/2",
+    "invoice_date": "2026-03-29",
+    "invoice_total": 12936.3,
+    "currency": "USD",
+    "po_number": "PO-44019",
+    "risk_flags": [
+      "po_invoice",
+      "possible_duplicate",
+      "receipt_variance",
+      "partial_receipt",
+      "tax_variance"
+    ],
+    "summary": "Project equipment invoice with duplicate, receipt, and tax controls open."
+  },
+  "artifacts": [
+    {
+      "artifact_id": "art-invoice",
+      "artifact_type": "invoice_packet",
+      "title": "Invoice packet NC-8831/2",
+      "summary": "Three-line project equipment invoice with billed sales tax and a slash-suffixed invoice number.",
+      "fields": [
+        {"label": "Vendor", "value": "Northshore Controls"},
+        {"label": "Invoice number", "value": "NC-8831/2"},
+        {"label": "Invoice date", "value": "2026-03-29"},
+        {"label": "PO number", "value": "PO-44019"},
+        {"label": "Project code", "value": "GREEN-440"},
+        {"label": "Subtotal", "value": "12090.00 USD"},
+        {"label": "Tax", "value": "846.30 USD"},
+        {"label": "Gross total", "value": "12936.30 USD"}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Calibration harness assemblies",
+          "quantity": 42.0,
+          "unit_price": 95.0,
+          "amount": 3990.0,
+          "status": "invoiced",
+          "notes": "PO line 10"
+        },
+        {
+          "line_id": "L2",
+          "description": "Field junction boxes",
+          "quantity": 18.0,
+          "unit_price": 190.0,
+          "amount": 3420.0,
+          "status": "invoiced",
+          "notes": "PO line 20"
+        },
+        {
+          "line_id": "L3",
+          "description": "Sensor mounting rails",
+          "quantity": 36.0,
+          "unit_price": 130.0,
+          "amount": 4680.0,
+          "status": "invoiced",
+          "notes": "PO line 30"
+        }
+      ],
+      "events": [
+        {
+          "event_id": "evt-received",
+          "event_type": "invoice_received",
+          "event_date": "2026-03-30",
+          "description": "AP queue received invoice packet through EDI",
+          "quantity": null,
+          "amount": 12936.3,
+          "status": "queued"
+        }
+      ],
+      "related_refs": [
+        "art-po",
+        "art-receipts",
+        "art-history",
+        "art-vendor",
+        "art-policy",
+        "EX-POSSIBLE-DUP",
+        "EX-RECEIPT-L1",
+        "EX-RECEIPT-L2",
+        "EX-TAX-001"
+      ]
+    },
+    {
+      "artifact_id": "art-po",
+      "artifact_type": "purchase_order",
+      "title": "PO-44019",
+      "summary": "Project equipment order for GREEN-440.",
+      "fields": [
+        {"label": "Buyer", "value": "Project Procurement"},
+        {"label": "PO number", "value": "PO-44019"},
+        {"label": "Supplier", "value": "Northshore Controls"},
+        {"label": "Project code", "value": "GREEN-440"},
+        {"label": "Tax handling", "value": "GREEN-440 exemption certificate applies; consult AP tax rules when billed tax appears."}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Calibration harness assemblies",
+          "quantity": 42.0,
+          "unit_price": 95.0,
+          "amount": 3990.0,
+          "status": "ordered",
+          "notes": "PO line 10"
+        },
+        {
+          "line_id": "L2",
+          "description": "Field junction boxes",
+          "quantity": 18.0,
+          "unit_price": 190.0,
+          "amount": 3420.0,
+          "status": "ordered",
+          "notes": "PO line 20"
+        },
+        {
+          "line_id": "L3",
+          "description": "Sensor mounting rails",
+          "quantity": 36.0,
+          "unit_price": 130.0,
+          "amount": 4680.0,
+          "status": "ordered",
+          "notes": "PO line 30"
+        }
+      ],
+      "events": [],
+      "related_refs": ["art-invoice", "art-receipts", "art-vendor"]
+    },
+    {
+      "artifact_id": "art-receipts",
+      "artifact_type": "receipt_log",
+      "title": "Receipt log for PO-44019",
+      "summary": "One line is short, one line is under later receiving review, and one line is fully received.",
+      "fields": [
+        {"label": "Receiving site", "value": "GREEN-440 project warehouse"},
+        {"label": "Last receipt update", "value": "2026-03-30"},
+        {"label": "Open receipt issue", "value": "PO line 20 needs receiving follow-up before support is final"}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Calibration harness assemblies",
+          "quantity": 41.0,
+          "unit_price": null,
+          "amount": null,
+          "status": "short_received",
+          "notes": "41 of 42 units posted on 2026-03-27"
+        },
+        {
+          "line_id": "L2",
+          "description": "Field junction boxes",
+          "quantity": 18.0,
+          "unit_price": null,
+          "amount": null,
+          "status": "received_under_review",
+          "notes": "Initial receipt posted on 2026-03-26; see receiving history for current support"
+        },
+        {
+          "line_id": "L3",
+          "description": "Sensor mounting rails",
+          "quantity": 36.0,
+          "unit_price": null,
+          "amount": null,
+          "status": "fully_received",
+          "notes": "Received in full on 2026-03-28"
+        }
+      ],
+      "events": [
+        {
+          "event_id": "evt-rcv-l1",
+          "event_type": "goods_receipt",
+          "event_date": "2026-03-27",
+          "description": "Received 41 calibration harness assemblies",
+          "quantity": 41.0,
+          "amount": null,
+          "status": "posted"
+        },
+        {
+          "event_id": "evt-rcv-l2-initial",
+          "event_type": "goods_receipt",
+          "event_date": "2026-03-26",
+          "description": "Received 18 field junction boxes",
+          "quantity": 18.0,
+          "amount": null,
+          "status": "initially_posted"
+        },
+        {
+          "event_id": "evt-rcv-l2-review",
+          "event_type": "receiving_review",
+          "event_date": "2026-03-30",
+          "description": "Receiving posted a follow-up control update for 18 field junction boxes after damage inspection",
+          "quantity": null,
+          "amount": null,
+          "status": "review_open"
+        },
+        {
+          "event_id": "evt-rcv-l3",
+          "event_type": "goods_receipt",
+          "event_date": "2026-03-28",
+          "description": "Received 36 sensor mounting rails",
+          "quantity": 36.0,
+          "amount": null,
+          "status": "posted"
+        }
+      ],
+      "related_refs": ["art-po", "art-history"]
+    },
+    {
+      "artifact_id": "art-history",
+      "artifact_type": "invoice_history",
+      "title": "Receiving and invoice history for NC-8831/2",
+      "summary": "History shows a reversed prior AP import and an open later receiving hold on PO line 20.",
+      "fields": [
+        {"label": "Prior AP duplicate status", "value": "Same normalized invoice number was reversed before payment after EDI retry"},
+        {"label": "Latest receiving control", "value": "Damage hold case RCV-1187 is open on PO line 20 as of 2026-03-30"},
+        {"label": "Replacement ETA", "value": "Pending vendor reship confirmation"}
+      ],
+      "line_items": [],
+      "events": [
+        {
+          "event_id": "evt-dup-reversal",
+          "event_type": "invoice_reversal",
+          "event_date": "2026-03-18",
+          "description": "Prior AP record NC88312 reversed before payment after duplicate EDI import",
+          "quantity": null,
+          "amount": 12936.3,
+          "status": "closed"
+        },
+        {
+          "event_id": "evt-receiving-hold",
+          "event_type": "receiving_hold",
+          "event_date": "2026-03-30",
+          "description": "Receiving opened a damage hold on PO line 20 after inspection; replacement disposition pending",
+          "quantity": 18.0,
+          "amount": 3420.0,
+          "status": "open"
+        }
+      ],
+      "related_refs": ["EX-POSSIBLE-DUP", "EX-RECEIPT-L2", "art-receipts"]
+    },
+    {
+      "artifact_id": "art-vendor",
+      "artifact_type": "vendor_master",
+      "title": "Vendor master: Northshore Controls",
+      "summary": "Active vendor with an active GREEN-440 project exemption profile.",
+      "fields": [
+        {"label": "Vendor ID", "value": "V-229"},
+        {"label": "Vendor status", "value": "Active"},
+        {"label": "Project exemption profile", "value": "GREEN-440 exemption certificate EX-118 active through 2026-12-31"},
+        {"label": "Tax note", "value": "Exemption certificate is on file for GREEN-440; consult AP tax handling rules when billed tax appears"}
+      ],
+      "line_items": [],
+      "events": [],
+      "related_refs": ["art-invoice", "art-po", "EX-TAX-001"]
+    },
+    {
+      "artifact_id": "art-policy",
+      "artifact_type": "policy_card",
+      "title": "AP policy card",
+      "summary": "Duplicate, receipt, chronology, and tax handling rules for project invoices.",
+      "fields": [
+        {"label": "Duplicate review rule", "value": "When possible_duplicate is flagged on format variants such as slash or punctuation changes, review a normalized invoice number match before relying on heuristic amount/date similarity."},
+        {"label": "Reversed duplicate rule", "value": "A prior AP record reversed or voided before payment is not a payment block."},
+        {"label": "De minimis receipt shortage", "value": "A line may release when unsupported amount is 150.00 USD or less and no later receiving reversal or hold remains."},
+        {"label": "Receipt chronology", "value": "The latest receiving control event supersedes earlier posted receipt support. If current support is still under review, route the line to Receiving instead of approving it."},
+        {"label": "Tax dispute rule", "value": "If billed tax conflicts with active exempt project status, hold the full invoice and route the case to tax even when supported goods lines are approved."}
+      ],
+      "line_items": [],
+      "events": [],
+      "related_refs": ["EX-POSSIBLE-DUP", "EX-RECEIPT-L1", "EX-RECEIPT-L2", "EX-TAX-001"]
+    }
+  ],
+  "exceptions": [
+    {
+      "exception_id": "EX-POSSIBLE-DUP",
+      "exception_type": "possible_duplicate",
+      "severity": "high",
+      "headline": "Duplicate control is open for this invoice",
+      "impacted_line_ids": ["L1", "L2", "L3"],
+      "short_description": "A prior AP record may overlap with this invoice.",
+      "fields": [
+        {"label": "Invoice number", "value": "NC-8831/2"},
+        {"label": "Vendor", "value": "Northshore Controls"},
+        {"label": "Control status", "value": "Duplicate review required before release"}
+      ],
+      "reviewer_guidance": "Run the relevant duplicate search and review candidate status before deciding."
+    },
+    {
+      "exception_id": "EX-RECEIPT-L1",
+      "exception_type": "receipt_quantity_variance",
+      "severity": "medium",
+      "headline": "Receipt support is short on L1",
+      "impacted_line_ids": ["L1"],
+      "short_description": "Received quantity on L1 is below the invoiced quantity.",
+      "fields": [
+        {"label": "Invoice quantity", "value": "42"},
+        {"label": "Received quantity", "value": "41"},
+        {"label": "Short quantity", "value": "1"}
+      ],
+      "reviewer_guidance": "Review the invoice unit rate, receipt details, and shortage rule before deciding."
+    },
+    {
+      "exception_id": "EX-RECEIPT-L2",
+      "exception_type": "receipt_quantity_variance",
+      "severity": "high",
+      "headline": "Receipt support changed after initial posting on L2",
+      "impacted_line_ids": ["L2"],
+      "short_description": "The initial receipt support on L2 may no longer be current.",
+      "fields": [
+        {"label": "Invoice quantity", "value": "18"},
+        {"label": "Initial posted receipt", "value": "18 units on 2026-03-26"},
+        {"label": "Latest control update", "value": "Receiving review posted on 2026-03-30"}
+      ],
+      "reviewer_guidance": "Review the receipt log and receiving history before deciding whether support is still current."
+    },
+    {
+      "exception_id": "EX-TAX-001",
+      "exception_type": "tax_variance",
+      "severity": "high",
+      "headline": "Tax control is open for this invoice",
+      "impacted_line_ids": ["L1", "L2", "L3"],
+      "short_description": "Billed tax may conflict with the expected project tax treatment.",
+      "fields": [
+        {"label": "Project code", "value": "GREEN-440"},
+        {"label": "Invoice taxable basis", "value": "12090.00 USD"},
+        {"label": "Billed tax", "value": "846.30 USD"},
+        {"label": "Jurisdiction", "value": "Washington"}
+      ],
+      "reviewer_guidance": "Review the vendor tax profile and policy before deciding what can be released."
+    }
+  ],
+  "duplicate_candidates": [
+    {
+      "candidate_id": "CAND-NORM-01",
+      "vendor_name": "Northshore Controls",
+      "invoice_number": "NC88312",
+      "invoice_date": "2026-03-18",
+      "gross_amount": 12936.3,
+      "status": "reversed on 2026-03-18 before payment after EDI retry; closed",
+      "match_basis": "Normalized invoice number + vendor + gross amount",
+      "overlap_summary": "Same normalized invoice number. Prior AP record was reversed before payment.",
+      "supported_match_strategies": ["normalized_invoice_no"]
+    },
+    {
+      "candidate_id": "CAND-AMT-02",
+      "vendor_name": "Northshore Controls",
+      "invoice_number": "NC-8807",
+      "invoice_date": "2026-03-29",
+      "gross_amount": 12936.3,
+      "status": "open",
+      "match_basis": "Vendor + gross amount + nearby invoice date",
+      "overlap_summary": "Same amount and nearby date, but invoice number and project context differ.",
+      "supported_match_strategies": ["vendor_amount_date"]
+    }
+  ],
+  "hidden_truth": {
+    "line_expectations": {
+      "L1": {
+        "amount": 3990.0,
+        "score_map": {
+          "approve": 1.0,
+          "hold": 0.25,
+          "escalate": 0.1,
+          "reject": 0.0
+        },
+        "accepted_reason_sets": [
+          ["partial_receipt_pending", "safe_to_pay"],
+          ["safe_to_pay"],
+          ["partial_receipt_pending"]
+        ],
+        "accepted_routes": [],
+        "gating_refs": [
+          "art-invoice",
+          "art-receipts",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01",
+          "EX-RECEIPT-L1",
+          "art-policy"
+        ],
+        "safe_gating_refs": [
+          "art-receipts",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01",
+          "EX-RECEIPT-L1"
+        ],
+        "decisive_refs": [
+          "art-invoice",
+          "art-receipts",
+          "EX-RECEIPT-L1",
+          "art-policy",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ],
+        "unsafe_approve": false
+      },
+      "L2": {
+        "amount": 3420.0,
+        "score_map": {
+          "hold": 1.0,
+          "escalate": 0.45,
+          "reject": 0.1,
+          "approve": 0.0
+        },
+        "accepted_reason_sets": [
+          ["receipt_not_confirmed"],
+          ["receipt_not_confirmed", "partial_receipt_pending"]
+        ],
+        "accepted_routes": ["receiving"],
+        "gating_refs": [
+          "art-receipts",
+          "art-history",
+          "EX-RECEIPT-L2",
+          "art-policy"
+        ],
+        "safe_gating_refs": [
+          "art-receipts",
+          "EX-RECEIPT-L2"
+        ],
+        "decisive_refs": [
+          "art-receipts",
+          "art-history",
+          "EX-RECEIPT-L2",
+          "art-policy"
+        ],
+        "unsafe_approve": true
+      },
+      "L3": {
+        "amount": 4680.0,
+        "score_map": {
+          "approve": 1.0,
+          "hold": 0.1,
+          "escalate": 0.05,
+          "reject": 0.0
+        },
+        "accepted_reason_sets": [
+          ["matched_to_po_and_receipt", "safe_to_pay"],
+          ["safe_to_pay"],
+          ["possible_duplicate_review"]
+        ],
+        "accepted_routes": [],
+        "gating_refs": [
+          "art-invoice",
+          "art-receipts",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ],
+        "safe_gating_refs": [
+          "art-receipts",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ],
+        "decisive_refs": [
+          "art-invoice",
+          "art-receipts",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ],
+        "unsafe_approve": false
+      }
+    },
+    "header_expectation": {
+      "score_map": {
+        "hold_full_invoice": 1.0,
+        "escalate_case": 0.55,
+        "reject_full_invoice": 0.05,
+        "release_approved_lines": 0.0
+      },
+      "accepted_reason_sets": [
+        ["tax_amount_mismatch"],
+        ["tax_amount_mismatch", "receipt_not_confirmed"],
+        ["tax_amount_mismatch", "safe_to_pay"]
+      ],
+      "accepted_routes": ["tax"],
+      "gating_refs": [
+        "art-vendor",
+        "art-policy",
+        "EX-TAX-001"
+      ],
+      "safe_gating_refs": [
+        "art-vendor",
+        "art-policy",
+        "EX-TAX-001"
+      ],
+      "decisive_refs": [
+        "art-vendor",
+        "art-policy",
+        "EX-TAX-001"
+      ],
+      "unsafe_recommendations": ["release_approved_lines"],
+      "overconservative_recommendations": ["escalate_case"]
+    },
+    "note_expectations": [
+      {
+        "issue_id": "duplicate_cleared",
+        "accepted_reason_sets": [
+          ["possible_duplicate_review", "safe_to_pay"],
+          ["possible_duplicate_review"]
+        ],
+        "decisive_refs": [
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ]
+      },
+      {
+        "issue_id": "receipt_reversal_hold",
+        "accepted_reason_sets": [
+          ["receipt_not_confirmed"]
+        ],
+        "decisive_refs": [
+          "art-history",
+          "EX-RECEIPT-L2"
+        ]
+      },
+      {
+        "issue_id": "tax_hold",
+        "accepted_reason_sets": [
+          ["tax_amount_mismatch"]
+        ],
+        "decisive_refs": [
+          "art-vendor",
+          "art-policy",
+          "EX-TAX-001"
+        ]
+      }
+    ],
+    "efficient_step_target": 18
+  }
+}

data/scenarios/medium.json ADDED Viewed

	@@ -0,0 +1,313 @@

+{
+  "scenario_id": "medium",
+  "task_id": "medium",
+  "case_id": "CASE-MEDIUM-001",
+  "title": "PO invoice with a duplicate control that clears only after number-based review",
+  "description": "Review a PO-backed goods invoice with a possible duplicate flag. The correct action depends on choosing the right duplicate search, interpreting the surfaced candidates, and then deciding whether the invoice can release.",
+  "step_limit": 12,
+  "queue_card": {
+    "case_id": "CASE-MEDIUM-001",
+    "vendor_name": "TechLink Solutions",
+    "vendor_id": "V-315",
+    "invoice_number": "TL-9205/A",
+    "invoice_date": "2026-03-22",
+    "invoice_total": 3800.0,
+    "currency": "USD",
+    "po_number": "PO-29034",
+    "risk_flags": ["po_invoice", "possible_duplicate"],
+    "summary": "PO-backed goods invoice with an open duplicate control."
+  },
+  "artifacts": [
+    {
+      "artifact_id": "art-invoice",
+      "artifact_type": "invoice_packet",
+      "title": "Invoice packet TL-9205/A",
+      "summary": "Two-line goods invoice with PO reference.",
+      "fields": [
+        {"label": "Vendor", "value": "TechLink Solutions"},
+        {"label": "Invoice number", "value": "TL-9205/A"},
+        {"label": "Invoice date", "value": "2026-03-22"},
+        {"label": "PO number", "value": "PO-29034"},
+        {"label": "Payment terms", "value": "Net 30"},
+        {"label": "Gross total", "value": "3800.00 USD"}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Server rack mounting components",
+          "quantity": 6.0,
+          "unit_price": 350.0,
+          "amount": 2100.0,
+          "status": "invoiced",
+          "notes": "PO line 10"
+        },
+        {
+          "line_id": "L2",
+          "description": "Cable management kit",
+          "quantity": 4.0,
+          "unit_price": 425.0,
+          "amount": 1700.0,
+          "status": "invoiced",
+          "notes": "PO line 20"
+        }
+      ],
+      "events": [
+        {
+          "event_id": "evt-received",
+          "event_type": "invoice_received",
+          "event_date": "2026-03-23",
+          "description": "Invoice packet received through EDI channel",
+          "quantity": null,
+          "amount": 3800.0,
+          "status": "queued"
+        }
+      ],
+      "related_refs": ["art-po", "art-receipts", "EX-POSSIBLE-DUP"]
+    },
+    {
+      "artifact_id": "art-po",
+      "artifact_type": "purchase_order",
+      "title": "PO-29034",
+      "summary": "Purchase order for IT infrastructure components.",
+      "fields": [
+        {"label": "Buyer", "value": "IT Procurement"},
+        {"label": "PO number", "value": "PO-29034"},
+        {"label": "Supplier", "value": "TechLink Solutions"},
+        {"label": "Payment terms", "value": "Net 30"}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Server rack mounting components",
+          "quantity": 6.0,
+          "unit_price": 350.0,
+          "amount": 2100.0,
+          "status": "ordered",
+          "notes": "PO line 10"
+        },
+        {
+          "line_id": "L2",
+          "description": "Cable management kit",
+          "quantity": 4.0,
+          "unit_price": 425.0,
+          "amount": 1700.0,
+          "status": "ordered",
+          "notes": "PO line 20"
+        }
+      ],
+      "events": [],
+      "related_refs": ["art-invoice", "art-receipts"]
+    },
+    {
+      "artifact_id": "art-receipts",
+      "artifact_type": "receipt_log",
+      "title": "Receipt log for PO-29034",
+      "summary": "Both lines are fully received.",
+      "fields": [
+        {"label": "Receiving site", "value": "Central IT warehouse"},
+        {"label": "Last receipt update", "value": "2026-03-21"},
+        {"label": "Open receipt issue", "value": "None"}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Server rack mounting components",
+          "quantity": 6.0,
+          "unit_price": null,
+          "amount": 2100.0,
+          "status": "fully_received",
+          "notes": "Received in full on 2026-03-20"
+        },
+        {
+          "line_id": "L2",
+          "description": "Cable management kit",
+          "quantity": 4.0,
+          "unit_price": null,
+          "amount": 1700.0,
+          "status": "fully_received",
+          "notes": "Received in full on 2026-03-21"
+        }
+      ],
+      "events": [
+        {
+          "event_id": "evt-rcv-l1",
+          "event_type": "goods_receipt",
+          "event_date": "2026-03-20",
+          "description": "Received 6 server rack mounting components",
+          "quantity": 6.0,
+          "amount": 2100.0,
+          "status": "posted"
+        },
+        {
+          "event_id": "evt-rcv-l2",
+          "event_type": "goods_receipt",
+          "event_date": "2026-03-21",
+          "description": "Received 4 cable management kits",
+          "quantity": 4.0,
+          "amount": 1700.0,
+          "status": "posted"
+        }
+      ],
+      "related_refs": ["art-po"]
+    },
+    {
+      "artifact_id": "art-vendor",
+      "artifact_type": "vendor_master",
+      "title": "Vendor master: TechLink Solutions",
+      "summary": "Active vendor with no payment hold.",
+      "fields": [
+        {"label": "Vendor ID", "value": "V-315"},
+        {"label": "Payment terms", "value": "Net 30"},
+        {"label": "Vendor status", "value": "Active"},
+        {"label": "Hold status", "value": "No vendor hold"}
+      ],
+      "line_items": [],
+      "events": [],
+      "related_refs": ["art-invoice"]
+    },
+    {
+      "artifact_id": "art-policy",
+      "artifact_type": "policy_card",
+      "title": "AP policy card",
+      "summary": "Duplicate-review precedence rules.",
+      "fields": [
+        {"label": "Duplicate review rule", "value": "When possible_duplicate is flagged, review a number-based duplicate search before relying on heuristic amount/date similarity."},
+        {"label": "Reversed prior record", "value": "A reversed or voided prior record with the same vendor and invoice number is not a payment block."},
+        {"label": "Heuristic amount/date hit", "value": "A same-amount same-date hit with a different invoice number is informational only unless other evidence shows true duplicate billing."}
+      ],
+      "line_items": [],
+      "events": [],
+      "related_refs": ["EX-POSSIBLE-DUP"]
+    }
+  ],
+  "exceptions": [
+    {
+      "exception_id": "EX-POSSIBLE-DUP",
+      "exception_type": "possible_duplicate",
+      "severity": "high",
+      "headline": "Duplicate control is open for this invoice",
+      "impacted_line_ids": ["L1", "L2"],
+      "short_description": "A prior AP record may overlap with this invoice.",
+      "fields": [
+        {"label": "Invoice number", "value": "TL-9205/A"},
+        {"label": "Vendor", "value": "TechLink Solutions"},
+        {"label": "Control status", "value": "Duplicate review required before release"}
+      ],
+      "reviewer_guidance": "Run the relevant duplicate search and review surfaced candidates before deciding."
+    }
+  ],
+  "duplicate_candidates": [
+    {
+      "candidate_id": "CAND-NORM-01",
+      "vendor_name": "TechLink Solutions",
+      "invoice_number": "TL9205A",
+      "invoice_date": "2026-03-10",
+      "gross_amount": 3800.0,
+      "status": "reversed on 2026-03-11 after import duplicate; closed",
+      "match_basis": "Normalized invoice number + vendor + gross amount",
+      "overlap_summary": "Same normalized invoice number. Prior record was reversed before payment.",
+      "supported_match_strategies": [
+        "normalized_invoice_no"
+      ]
+    },
+    {
+      "candidate_id": "CAND-AMT-02",
+      "vendor_name": "TechLink Solutions",
+      "invoice_number": "TL-9188",
+      "invoice_date": "2026-03-22",
+      "gross_amount": 3800.0,
+      "status": "open",
+      "match_basis": "Vendor + gross amount + nearby invoice date",
+      "overlap_summary": "Same amount and nearby date, but invoice number differs.",
+      "supported_match_strategies": [
+        "vendor_amount_date"
+      ]
+    }
+  ],
+  "hidden_truth": {
+    "line_expectations": {
+      "L1": {
+        "amount": 2100.0,
+        "score_map": {
+          "approve": 1.0,
+          "hold": 0.3,
+          "escalate": 0.1,
+          "reject": 0.0
+        },
+        "accepted_reason_sets": [
+          ["matched_to_po_and_receipt", "safe_to_pay"],
+          ["safe_to_pay", "possible_duplicate_review"],
+          ["safe_to_pay"]
+        ],
+        "accepted_routes": [],
+        "gating_refs": [
+          "art-po",
+          "art-receipts",
+          "EX-POSSIBLE-DUP",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ],
+        "decisive_refs": ["art-po", "art-receipts", "EX-POSSIBLE-DUP", "duplicate_check:normalized_invoice_no", "CAND-NORM-01"],
+        "unsafe_approve": false
+      },
+      "L2": {
+        "amount": 1700.0,
+        "score_map": {
+          "approve": 1.0,
+          "hold": 0.3,
+          "escalate": 0.1,
+          "reject": 0.0
+        },
+        "accepted_reason_sets": [
+          ["matched_to_po_and_receipt", "safe_to_pay"],
+          ["safe_to_pay", "possible_duplicate_review"],
+          ["safe_to_pay"]
+        ],
+        "accepted_routes": [],
+        "gating_refs": [
+          "art-po",
+          "art-receipts",
+          "EX-POSSIBLE-DUP",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ],
+        "decisive_refs": ["art-po", "art-receipts", "EX-POSSIBLE-DUP", "duplicate_check:normalized_invoice_no", "CAND-NORM-01"],
+        "unsafe_approve": false
+      }
+    },
+    "header_expectation": {
+      "score_map": {
+        "release_approved_lines": 1.0,
+        "hold_full_invoice": 0.35,
+        "escalate_case": 0.15,
+        "reject_full_invoice": 0.0
+      },
+      "accepted_reason_sets": [
+        ["safe_to_pay", "possible_duplicate_review"],
+        ["safe_to_pay"]
+      ],
+      "accepted_routes": [],
+      "gating_refs": [
+        "art-po",
+        "art-receipts",
+        "EX-POSSIBLE-DUP",
+        "duplicate_check:normalized_invoice_no",
+        "CAND-NORM-01"
+      ],
+      "decisive_refs": ["art-po", "art-receipts", "EX-POSSIBLE-DUP", "duplicate_check:normalized_invoice_no", "CAND-NORM-01"],
+      "unsafe_recommendations": [],
+      "overconservative_recommendations": ["hold_full_invoice", "escalate_case"]
+    },
+    "note_expectations": [
+      {
+        "issue_id": "duplicate_cleared",
+        "accepted_reason_sets": [
+          ["possible_duplicate_review", "safe_to_pay"],
+          ["possible_duplicate_review"]
+        ],
+        "decisive_refs": ["duplicate_check:normalized_invoice_no", "CAND-NORM-01"]
+      }
+    ],
+    "efficient_step_target": 9
+  }
+}

data/scenarios/medium_plus.json ADDED Viewed

	@@ -0,0 +1,374 @@

+{
+  "scenario_id": "medium_plus",
+  "task_id": "medium_plus",
+  "case_id": "CASE-MEDIUMPLUS-001",
+  "title": "PO invoice with one receipt-blocked line after duplicate clearance",
+  "description": "Review a PO-backed goods invoice with a possible duplicate flag and a short-received line. The correct action depends on clearing the duplicate, judging the unsupported amount on the short line, and choosing partial release instead of a full invoice hold.",
+  "step_limit": 17,
+  "queue_card": {
+    "case_id": "CASE-MEDIUMPLUS-001",
+    "vendor_name": "Apex Facility Systems",
+    "vendor_id": "V-411",
+    "invoice_number": "AFS-7719/B",
+    "invoice_date": "2026-03-24",
+    "invoice_total": 3050.0,
+    "currency": "USD",
+    "po_number": "PO-55312",
+    "risk_flags": [
+      "po_invoice",
+      "possible_duplicate",
+      "receipt_variance",
+      "partial_receipt"
+    ],
+    "summary": "PO-backed goods invoice with duplicate review open and one receipt support issue."
+  },
+  "artifacts": [
+    {
+      "artifact_id": "art-invoice",
+      "artifact_type": "invoice_packet",
+      "title": "Invoice packet AFS-7719/B",
+      "summary": "Two-line goods invoice with one high-rate short line.",
+      "fields": [
+        {"label": "Vendor", "value": "Apex Facility Systems"},
+        {"label": "Invoice number", "value": "AFS-7719/B"},
+        {"label": "Invoice date", "value": "2026-03-24"},
+        {"label": "PO number", "value": "PO-55312"},
+        {"label": "Payment terms", "value": "Net 30"},
+        {"label": "Gross total", "value": "3050.00 USD"}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Relay control modules",
+          "quantity": 10.0,
+          "unit_price": 185.0,
+          "amount": 1850.0,
+          "status": "invoiced",
+          "notes": "PO line 10"
+        },
+        {
+          "line_id": "L2",
+          "description": "Backup power supply units",
+          "quantity": 5.0,
+          "unit_price": 240.0,
+          "amount": 1200.0,
+          "status": "invoiced",
+          "notes": "PO line 20"
+        }
+      ],
+      "events": [
+        {
+          "event_id": "evt-received",
+          "event_type": "invoice_received",
+          "event_date": "2026-03-25",
+          "description": "Invoice packet received through EDI channel",
+          "quantity": null,
+          "amount": 3050.0,
+          "status": "queued"
+        }
+      ],
+      "related_refs": [
+        "art-po",
+        "art-receipts",
+        "art-policy",
+        "EX-POSSIBLE-DUP",
+        "EX-RECEIPT-L2"
+      ]
+    },
+    {
+      "artifact_id": "art-po",
+      "artifact_type": "purchase_order",
+      "title": "PO-55312",
+      "summary": "Purchase order for facility control hardware.",
+      "fields": [
+        {"label": "Buyer", "value": "Facilities Procurement"},
+        {"label": "PO number", "value": "PO-55312"},
+        {"label": "Supplier", "value": "Apex Facility Systems"},
+        {"label": "Payment terms", "value": "Net 30"}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Relay control modules",
+          "quantity": 10.0,
+          "unit_price": 185.0,
+          "amount": 1850.0,
+          "status": "ordered",
+          "notes": "PO line 10"
+        },
+        {
+          "line_id": "L2",
+          "description": "Backup power supply units",
+          "quantity": 5.0,
+          "unit_price": 240.0,
+          "amount": 1200.0,
+          "status": "ordered",
+          "notes": "PO line 20"
+        }
+      ],
+      "events": [],
+      "related_refs": ["art-invoice", "art-receipts"]
+    },
+    {
+      "artifact_id": "art-receipts",
+      "artifact_type": "receipt_log",
+      "title": "Receipt log for PO-55312",
+      "summary": "One line is fully received and one line remains one unit short.",
+      "fields": [
+        {"label": "Receiving site", "value": "South regional warehouse"},
+        {"label": "Last receipt update", "value": "2026-03-23"},
+        {"label": "Open receipt issue", "value": "PO line 20 remains one unit short"}
+      ],
+      "line_items": [
+        {
+          "line_id": "L1",
+          "description": "Relay control modules",
+          "quantity": 10.0,
+          "unit_price": null,
+          "amount": 1850.0,
+          "status": "fully_received",
+          "notes": "Received in full on 2026-03-22"
+        },
+        {
+          "line_id": "L2",
+          "description": "Backup power supply units",
+          "quantity": 4.0,
+          "unit_price": null,
+          "amount": null,
+          "status": "short_received",
+          "notes": "4 of 5 units posted on 2026-03-23; remaining unit not yet received"
+        }
+      ],
+      "events": [
+        {
+          "event_id": "evt-rcv-l1",
+          "event_type": "goods_receipt",
+          "event_date": "2026-03-22",
+          "description": "Received 10 relay control modules",
+          "quantity": 10.0,
+          "amount": 1850.0,
+          "status": "posted"
+        },
+        {
+          "event_id": "evt-rcv-l2",
+          "event_type": "goods_receipt",
+          "event_date": "2026-03-23",
+          "description": "Received 4 backup power supply units",
+          "quantity": 4.0,
+          "amount": null,
+          "status": "posted"
+        }
+      ],
+      "related_refs": ["art-po", "art-invoice", "EX-RECEIPT-L2"]
+    },
+    {
+      "artifact_id": "art-policy",
+      "artifact_type": "policy_card",
+      "title": "AP policy card",
+      "summary": "Duplicate, de minimis receipt shortage, and partial release rules.",
+      "fields": [
+        {"label": "Duplicate review rule", "value": "When possible_duplicate is flagged, review a normalized invoice number match before relying on heuristic amount/date similarity."},
+        {"label": "Reversed prior record", "value": "A reversed or voided prior record with the same vendor and invoice number is not a payment block."},
+        {"label": "De minimis receipt shortage", "value": "A line may release only when unsupported amount is 150.00 USD or less. If unsupported amount exceeds that threshold, hold the line to Receiving."},
+        {"label": "Partial release rule", "value": "If only specific lines remain unsupported and no case-level blocker exists, hold the affected lines and release the approved lines instead of holding the full invoice."}
+      ],
+      "line_items": [],
+      "events": [],
+      "related_refs": ["EX-POSSIBLE-DUP", "EX-RECEIPT-L2"]
+    }
+  ],
+  "exceptions": [
+    {
+      "exception_id": "EX-POSSIBLE-DUP",
+      "exception_type": "possible_duplicate",
+      "severity": "high",
+      "headline": "Duplicate control is open for this invoice",
+      "impacted_line_ids": ["L1", "L2"],
+      "short_description": "A prior AP record may overlap with this invoice.",
+      "fields": [
+        {"label": "Invoice number", "value": "AFS-7719/B"},
+        {"label": "Vendor", "value": "Apex Facility Systems"},
+        {"label": "Control status", "value": "Duplicate review required before release"}
+      ],
+      "reviewer_guidance": "Run the relevant duplicate search and review surfaced candidates before deciding."
+    },
+    {
+      "exception_id": "EX-RECEIPT-L2",
+      "exception_type": "receipt_quantity_variance",
+      "severity": "high",
+      "headline": "Receipt support is short on L2",
+      "impacted_line_ids": ["L2"],
+      "short_description": "Received quantity on L2 is below the invoiced quantity.",
+      "fields": [
+        {"label": "Invoice quantity", "value": "5"},
+        {"label": "Received quantity", "value": "4"},
+        {"label": "Short quantity", "value": "1"}
+      ],
+      "reviewer_guidance": "Review the invoiced unit rate, receipt log, and shortage rule before deciding whether L2 can release."
+    }
+  ],
+  "duplicate_candidates": [
+    {
+      "candidate_id": "CAND-NORM-01",
+      "vendor_name": "Apex Facility Systems",
+      "invoice_number": "AFS7719B",
+      "invoice_date": "2026-03-13",
+      "gross_amount": 3050.0,
+      "status": "reversed on 2026-03-14 after import duplicate; closed",
+      "match_basis": "Normalized invoice number + vendor + gross amount",
+      "overlap_summary": "Same normalized invoice number. Prior record was reversed before payment.",
+      "supported_match_strategies": [
+        "normalized_invoice_no"
+      ]
+    },
+    {
+      "candidate_id": "CAND-AMT-02",
+      "vendor_name": "Apex Facility Systems",
+      "invoice_number": "AFS-7688",
+      "invoice_date": "2026-03-24",
+      "gross_amount": 3050.0,
+      "status": "open",
+      "match_basis": "Vendor + gross amount + nearby invoice date",
+      "overlap_summary": "Same amount and nearby date, but invoice number differs.",
+      "supported_match_strategies": [
+        "vendor_amount_date"
+      ]
+    }
+  ],
+  "hidden_truth": {
+    "line_expectations": {
+      "L1": {
+        "amount": 1850.0,
+        "score_map": {
+          "approve": 1.0,
+          "hold": 0.25,
+          "escalate": 0.1,
+          "reject": 0.0
+        },
+        "accepted_reason_sets": [
+          ["matched_to_po_and_receipt", "safe_to_pay"],
+          ["safe_to_pay"],
+          ["possible_duplicate_review", "safe_to_pay"]
+        ],
+        "accepted_routes": [],
+        "gating_refs": [
+          "art-po",
+          "art-receipts",
+          "EX-POSSIBLE-DUP",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ],
+        "safe_gating_refs": [
+          "art-po",
+          "art-receipts",
+          "EX-POSSIBLE-DUP",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ],
+        "decisive_refs": [
+          "art-po",
+          "art-receipts",
+          "EX-POSSIBLE-DUP",
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ],
+        "unsafe_approve": false
+      },
+      "L2": {
+        "amount": 1200.0,
+        "score_map": {
+          "hold": 1.0,
+          "escalate": 0.45,
+          "reject": 0.1,
+          "approve": 0.0
+        },
+        "accepted_reason_sets": [
+          ["receipt_not_confirmed"],
+          ["receipt_not_confirmed", "partial_receipt_pending"],
+          ["partial_receipt_pending"]
+        ],
+        "accepted_routes": ["receiving"],
+        "gating_refs": [
+          "art-invoice",
+          "art-receipts",
+          "EX-RECEIPT-L2",
+          "art-policy"
+        ],
+        "safe_gating_refs": [
+          "art-receipts",
+          "EX-RECEIPT-L2"
+        ],
+        "decisive_refs": [
+          "art-invoice",
+          "art-receipts",
+          "EX-RECEIPT-L2",
+          "art-policy"
+        ],
+        "unsafe_approve": true
+      }
+    },
+    "header_expectation": {
+      "score_map": {
+        "release_approved_lines": 1.0,
+        "hold_full_invoice": 0.55,
+        "escalate_case": 0.35,
+        "reject_full_invoice": 0.0
+      },
+      "accepted_reason_sets": [
+        ["possible_duplicate_review", "receipt_not_confirmed", "safe_to_pay"],
+        ["possible_duplicate_review", "partial_receipt_pending", "safe_to_pay"],
+        ["receipt_not_confirmed", "safe_to_pay"]
+      ],
+      "accepted_routes": [],
+      "gating_refs": [
+        "art-policy",
+        "art-receipts",
+        "EX-RECEIPT-L2",
+        "duplicate_check:normalized_invoice_no",
+        "CAND-NORM-01"
+      ],
+      "safe_gating_refs": [
+        "art-receipts",
+        "EX-RECEIPT-L2",
+        "duplicate_check:normalized_invoice_no",
+        "CAND-NORM-01"
+      ],
+      "decisive_refs": [
+        "art-policy",
+        "art-receipts",
+        "EX-RECEIPT-L2",
+        "duplicate_check:normalized_invoice_no",
+        "CAND-NORM-01"
+      ],
+      "unsafe_recommendations": [],
+      "overconservative_recommendations": ["hold_full_invoice", "escalate_case"]
+    },
+    "note_expectations": [
+      {
+        "issue_id": "duplicate_cleared",
+        "accepted_reason_sets": [
+          ["possible_duplicate_review", "safe_to_pay"],
+          ["possible_duplicate_review"]
+        ],
+        "decisive_refs": [
+          "duplicate_check:normalized_invoice_no",
+          "CAND-NORM-01"
+        ]
+      },
+      {
+        "issue_id": "receipt_short_hold",
+        "accepted_reason_sets": [
+          ["receipt_not_confirmed"],
+          ["partial_receipt_pending"]
+        ],
+        "decisive_refs": [
+          "art-invoice",
+          "art-receipts",
+          "art-policy",
+          "EX-RECEIPT-L2"
+        ]
+      }
+    ],
+    "efficient_step_target": 13
+  }
+}

docs/rules.md ADDED Viewed

	@@ -0,0 +1,560 @@

+## Round 1 — Problem Statement
+### The Task
+Build a complete, real-world **OpenEnv** environment that an AI agent can learn from through the standard `step()` / `reset()` / `state()` API.
+### Key Requirements at a Glance
+* **Real-world Focus:** Must simulate a real-world task (not games or toys).
+* **Full Spec:** Implement full OpenEnv spec: typed models, `step()`/`reset()`/`state()`, and `openenv.yaml`.
+* **Tasks:** Minimum 3 tasks with agent graders (easy → medium → hard, scores 0.0–1.0).
+* **Rewards:** Meaningful reward function with partial progress signals.
+* **Baselines:** Baseline inference script with reproducible scores.
+* **Deployment:** Deploy to Hugging Face Spaces + working Dockerfile.
+* **Docs:** README with environment description, action/observation spaces, and setup instructions.
+---
+### Functional Requirements
+#### 1. Real-world task simulation
+The environment must simulate a task humans actually do.
+* **Examples:** Email triage, code review, data cleaning, scheduling, customer support, content moderation.
+#### 2. OpenEnv spec compliance
+Implement the full OpenEnv interface:
+* **Typed Models:** Observation, Action, and Reward Pydantic models.
+* **Methods:** `step(action)` → returns observation, reward, done, info; `reset()` → returns initial observation; `state()` → returns current state.
+* **Metadata:** `openenv.yaml` with metadata, tested via `openenv validate`.
+#### 3. Minimum 3 tasks with agent graders
+Each task defines a concrete objective with a programmatic grader (0.0–1.0).
+* **Progression:** Easy → Medium → Hard.
+* **Criteria:** Graders must have clear, deterministic success/failure criteria.
+#### 4. Meaningful reward function
+* Provides signal over the full trajectory (not just binary end-of-episode).
+* Rewards partial progress toward task completion.
+* Penalizes clearly undesirable behavior (e.g., infinite loops, destructive actions).
+#### 5. Baseline inference script
+* Uses the OpenAI API client to run a model against the environment.
+* Produces a reproducible baseline score on all public tasks.
+---
+### Non-Functional Requirements
+* **Hugging Face Spaces:** Environment must run as a containerized HF Space tagged with `openenv`.
+* **Containerization:** Must include a working `Dockerfile` that starts cleanly.
+* **Documentation:** README must include environment description, action/observation space definitions, task descriptions, and setup instructions.
+---
+### Scoring Rubric
+| Parameter                          | Weight | Description                                                               |
+| :--------------------------------- | :----- | :------------------------------------------------------------------------ |
+| **Real-world utility**             | 30%    | Does the environment model a genuine task useful for training/evaluation? |
+| **Task & grader quality**          | 25%    | Well-defined objectives? Fair measurement? Difficulty progression?        |
+| **Environment design**             | 20%    | Clean state management, sensible spaces, good reward shaping.             |
+| **Code quality & spec compliance** | 15%    | Follows OpenEnv spec, clean project structure, working Dockerfile.        |
+| **Creativity & novelty**           | 10%    | Novel problem domain or interesting mechanics.                            |
+### Scoring Breakdown
+**Real-world utility (30%)**
+* **0–5:** Toy/artificial problem with no practical application
+* **6–15:** Valid domain but shallow modeling of the real task
+* **16–25:** Good domain modeling, would be useful for agent evaluation
+* **26–30:** Excellent — fills a real gap, immediate value for the RL/agent community
+**Task & grader quality (25%)**
+* 3+ tasks with difficulty range?
+* Graders produce scores between 0.0–1.0?
+* Graders deterministic and reproducible?
+* Hard task genuinely challenges frontier models?
+**Environment design (20%)**
+* `reset()` produces clean state?
+* Action/observation types well-designed and documented?
+* Reward function provides useful varying signal (not just sparse)?
+* Episode boundaries sensible?
+**Code quality & spec compliance (15%)**
+* `openenv validate` passes?
+* `docker build && docker run` works?
+* HF Space deploys and responds?
+* Baseline script runs and reproduces scores?
+**Creativity & novelty (10%)**
+* Domain we haven’t seen in OpenEnv before?
+* Reward design has interesting properties?
+* Clever mechanics that make the environment engaging?
+---
+### How judging works
+#### Phase 1: Automated Validation
+Pass/fail gate — HF Space deploys, OpenEnv spec compliance, Dockerfile builds, baseline reproduces, 3+ tasks with graders.
+#### Phase 2: Agentic Evaluation
+Scored — baseline agent re-run, standard Open LLM agent (e.g. Nemotron 3 Super) run against all environments, score variance check.
+#### Phase 3: Human Review
+Top submissions are reviewed by Meta and Hugging Face engineers for real-world utility, creativity, and exploit checks.
+#### Disqualification Criteria
+* Environment does not deploy or respond
+* Plagiarized or trivially modified existing environments
+* Graders that always return the same score
+---
+### Pre-Submission Checklist
+**CRITICAL: All checks must pass during automated validation or you will be disqualified.**
+* **[ ] HF Space Deploys:** An automated ping to your Space URL must return an **HTTP 200** and successfully respond to a `/reset` call.
+* **[ ] OpenEnv Spec Compliance:** Your environment must pass validation for `openenv.yaml`, typed Pydantic models, and the required `step()`, `reset()`, and `state()` endpoints.
+* **[ ] Dockerfile Builds:** The automated system will run a `docker build` on your submitted repository; it must complete successfully.
+* **[ ] Baseline Reproduces:** The system will execute your `inference.py`. It must run without errors and produce scores for all tasks.
+* **[ ] 3+ Tasks with Graders:** The system will enumerate your tasks and run every grader to verify that scores fall strictly within the **0.0–1.0** range.
+### Mandatory Configuration
+Before submitting, ensure the following variables are defined in your environment configuration:
+* `API_BASE_URL` — The API endpoint for the LLM.
+* `MODEL_NAME` — The model identifier to use for inference.
+* `HF_TOKEN` — Your Hugging Face / API key.
+* The inference script must be named `inference.py` and placed in the root directory of the project.
+* Participants must use the OpenAI client for all LLM calls, configured with the variables above.
+* Participants must emit structured stdout logs strictly following the `[START]`, `[STEP]`, and `[END]` format defined in the sample `inference.py` provided below. Any deviation in field names, ordering, or formatting will result in incorrect evaluation scoring. Refer to the sample inference script for the complete format specification and examples.
+### Infrastructure & Runtime Restrictions
+To ensure fair and stable evaluation, your submission must adhere to these limits:
+* **Inference Time:** The total runtime of the `inference.py` script must be **less than 20 minutes**.
+* **Hardware Constraints:** Your environment and inference script must be able to run on a machine with:
+    * **vCPU:** 2
+    * **Memory:** 8GB RAM
+### Validator
+It is highly recommended that you **run the pre-submission validation script** locally (provided in the "Pre-Validation Script" section) before final submission to catch any Docker or spec errors early.
+---
+### FAQs
+**How are submissions evaluated?**
+Submissions are evaluated based on runtime correctness (runs without errors), interface compliance (follows OpenEnv standard), task design (clear and realistic), and grading logic (meaningful reward system).
+**What framework must be used?**
+Participants must use the **OpenEnv** framework. For LLM calls within the inference script, the **OpenAI Client** is mandatory.
+**What do I need to submit?**
+You must submit the URL to your containerized Hugging Face Space. Ensure your repository includes the `openenv.yaml` file, a working `Dockerfile`, and an `inference.py` script in the root directory.
+**Where can I get help?**
+You can join the [Discord Community](https://discord.gg/Dedhy5pkWD) for mentor access and announcements, or email the support team at `help_openenvhackathon@scaler.com`.
+---
+### Inference Script Example (`inference.py`)
+```python
+"""
+Inference Script Example
+===================================
+MANDATORY
+- Before submitting, ensure the following variables are defined in your environment configuration:
+    API_BASE_URL   The API endpoint for the LLM.
+    MODEL_NAME     The model identifier to use for inference.
+    HF_TOKEN       Your Hugging Face / API key.
+    LOCAL_IMAGE_NAME The name of the local image to use for the environment if you are using from_docker_image()
+                     method
+- Defaults are set only for API_BASE_URL and MODEL_NAME
+    (and should reflect your active inference setup):
+    API_BASE_URL = os.getenv("API_BASE_URL", "<your-active-endpoint>")
+    MODEL_NAME = os.getenv("MODEL_NAME", "<your-active-model>")
+- The inference script must be named `inference.py` and placed in the root directory of the project
+- Participants must use OpenAI Client for all LLM calls using above variables
+STDOUT FORMAT
+- The script must emit exactly three line types to stdout, in this order:
+    [START] task=<task_name> env=<benchmark> model=<model_name>
+    [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+    [END]   success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
+  Rules:
+    - One [START] line at episode begin.
+    - One [STEP] line per step, immediately after env.step() returns.
+    - One [END] line after env.close(), always emitted (even on exception).
+    - reward and rewards are formatted to 2 decimal places.
+    - done and success are lowercase booleans: true or false.
+    - error is the raw last_action_error string, or null if none.
+    - All fields on a single line with no newlines within a line.
+    - Each task should return a score in [0, 1].
+  Example:
+    [START] task=click-test env=miniwob model=Qwen3-VL-30B
+    [STEP] step=1 action=click('123') reward=0.00 done=false error=null
+    [STEP] step=2 action=fill('456','text') reward=0.00 done=false error=null
+    [STEP] step=3 action=click('789') reward=1.00 done=true error=null
+    [END] success=true steps=3 score=1.00 rewards=0.00,0.00,1.00
+"""
+import asyncio
+import os
+import textwrap
+from typing import List, Optional
+from openai import OpenAI
+from my_env_v4 import MyEnvV4Action, MyEnvV4Env
+IMAGE_NAME = os.getenv("IMAGE_NAME") # If you are using docker image
+API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
+API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
+MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
+TASK_NAME = os.getenv("MY_ENV_V4_TASK", "echo")
+BENCHMARK = os.getenv("MY_ENV_V4_BENCHMARK", "my_env_v4")
+MAX_STEPS = 8
+TEMPERATURE = 0.7
+MAX_TOKENS = 150
+SUCCESS_SCORE_THRESHOLD = 0.1  # normalized score in [0, 1]
+# Max possible reward: each token contributes 0.1, across all steps
+_MAX_REWARD_PER_STEP = MAX_TOKENS * 0.1
+MAX_TOTAL_REWARD = MAX_STEPS * _MAX_REWARD_PER_STEP
+SYSTEM_PROMPT = textwrap.dedent(
+    """
+    You are interacting with a simple echo environment.
+    Each turn you must send a message. The environment will echo it back.
+    Reward is proportional to message length: reward = len(message) * 0.1
+    Your goal is to maximize total reward by sending meaningful, substantive messages.
+    Reply with exactly one message string — no quotes, no prefixes, just the message text.
+    """
+).strip()
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    """Print one ``[STEP]`` record to stdout for a completed environment step.
+    Args:
+        step: Step number to report.
+        action: Action text sent to the environment; line breaks are collapsed to spaces.
+        reward: Step reward, printed with two decimal places.
+        done: Termination flag, printed as ``true`` or ``false``.
+        error: Error text for the step; a falsy value is printed as ``null``.
+    """
+    # The log format requires each record to stay on one physical line, but a
+    # model-generated action (or an error message) may contain line breaks.
+    action_val = " ".join(str(action).splitlines())
+    error_val = " ".join(str(error).splitlines()) if error else "null"
+    done_val = str(done).lower()
+    print(
+        f"[STEP] step={step} action={action_val} reward={reward:.2f} done={done_val} error={error_val}",
+        flush=True,
+    )
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True)
+def build_user_prompt(step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
+    history_block = "\n".join(history[-4:]) if history else "None"
+    return textwrap.dedent(
+        f"""
+        Step: {step}
+        Last echoed message: {last_echoed!r}
+        Last reward: {last_reward:.2f}
+        Previous steps:
+        {history_block}
+        Send your next message.
+        """
+    ).strip()
+def get_model_message(client: OpenAI, step: int, last_echoed: str, last_reward: float, history: List[str]) -> str:
+    user_prompt = build_user_prompt(step, last_echoed, last_reward, history)
+    try:
+        completion = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=TEMPERATURE,
+            max_tokens=MAX_TOKENS,
+            stream=False,
+        )
+        text = (completion.choices[0].message.content or "").strip()
+        return text if text else "hello"
+    except Exception as exc:
+        print(f"[DEBUG] Model request failed: {exc}", flush=True)
+        return "hello"
+async def main() -> None:
+    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+    env = await MyEnvV4Env.from_docker_image(IMAGE_NAME)
+    history: List[str] = []
+    rewards: List[float] = []
+    steps_taken = 0
+    score = 0.0
+    success = False
+    log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
+    try:
+        result = await env.reset() # OpenENV.reset()
+        last_echoed = result.observation.echoed_message
+        last_reward = 0.0
+        for step in range(1, MAX_STEPS + 1):
+            if result.done:
+                break
+            message = get_model_message(client, step, last_echoed, last_reward, history)
+            result = await env.step(MyEnvV4Action(message=message))
+            obs = result.observation
+            reward = result.reward or 0.0
+            done = result.done
+            error = None
+            rewards.append(reward)
+            steps_taken = step
+            last_echoed = obs.echoed_message
+            last_reward = reward
+            log_step(step=step, action=message, reward=reward, done=done, error=error)
+            history.append(f"Step {step}: {message!r} -> reward {reward:+.2f}")
+            if done:
+                break
+        score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
+        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
+        success = score >= SUCCESS_SCORE_THRESHOLD
+    finally:
+        try:
+            await env.close()
+        except Exception as e:
+            print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
+        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+---
+### Pre-Validation Script
+```bash
+#!/usr/bin/env bash
+#
+# validate-submission.sh — OpenEnv Submission Validator
+#
+# Checks that your HF Space is live, Docker image builds, and openenv validate passes.
+#
+# Prerequisites:
+#   - Docker:       https://docs.docker.com/get-docker/
+#   - openenv-core: pip install openenv-core
+#   - curl (usually pre-installed)
+#
+# Run:
+#   curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
+#
+#   Or download and run locally:
+#     chmod +x validate-submission.sh
+#     ./validate-submission.sh <ping_url> [repo_dir]
+#
+# Arguments:
+#   ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)
+#   repo_dir   Path to your repo (default: current directory)
+#
+# Examples:
+#   ./validate-submission.sh https://my-team.hf.space
+#   ./validate-submission.sh https://my-team.hf.space ./my-repo
+#
+set -uo pipefail
+# Time limit, in seconds, passed to run_with_timeout for the docker build.
+DOCKER_BUILD_TIMEOUT=600
+# Colour escapes default to empty and are only filled in when stdout is a
+# terminal, so redirected output carries no control sequences.
+RED=''
+GREEN=''
+YELLOW=''
+BOLD=''
+NC=''
+if [ -t 1 ]; then
+  RED='\033[0;31m'
+  GREEN='\033[0;32m'
+  YELLOW='\033[1;33m'
+  BOLD='\033[1m'
+  NC='\033[0m'
+fi
+run_with_timeout() {
+  # Run the command given after the first argument with a limit of $1 seconds.
+  # Uses timeout or gtimeout when one is on PATH; otherwise the command runs in
+  # the background next to a watcher subshell that kills it once the limit elapses.
+  local limit="$1"
+  shift
+  if command -v timeout &>/dev/null; then
+    timeout "$limit" "$@"
+    return
+  fi
+  if command -v gtimeout &>/dev/null; then
+    gtimeout "$limit" "$@"
+    return
+  fi
+  "$@" &
+  local cmd_pid=$!
+  ( sleep "$limit" && kill "$cmd_pid" 2>/dev/null ) &
+  local watcher_pid=$!
+  wait "$cmd_pid" 2>/dev/null
+  local exit_code=$?
+  # The command has finished or been killed; stop the watcher and reap it.
+  kill "$watcher_pid" 2>/dev/null
+  wait "$watcher_pid" 2>/dev/null
+  return $exit_code
+}
+portable_mktemp() {
+  # Create a temp file named <prefix>-XXXXXX under ${TMPDIR:-/tmp} and print its
+  # path; if the templated call fails, fall back to a plain mktemp.
+  local name_prefix="${1:-validate}"
+  local template="${TMPDIR:-/tmp}/${name_prefix}-XXXXXX"
+  mktemp "$template" 2>/dev/null || mktemp
+}
+# Temp files registered here are removed when the script exits.
+CLEANUP_FILES=()
+cleanup() {
+  # The ${arr[@]+...} form expands to nothing for an empty array, which keeps
+  # the expansion from tripping `set -u` on shells that treat it as unset.
+  rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"
+}
+trap cleanup EXIT
+PING_URL="${1:-}"
+REPO_DIR="${2:-.}"
+if [ -z "$PING_URL" ]; then
+  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
+  printf "\n"
+  printf "  ping_url   Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
+  printf "  repo_dir   Path to your repo (default: current directory)\n"
+  exit 1
+fi
+if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
+  printf "Error: directory '%s' not found\n" "${2:-.}"
+  exit 1
+fi
+PING_URL="${PING_URL%/}"
+export PING_URL
+PASS=0
+log()  { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
+pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
+fail() { log "${RED}FAILED${NC} -- $1"; }
+hint() { printf "  ${YELLOW}Hint:${NC} %b\n" "$1"; }
+stop_at() {
+  printf "\n"
+  printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
+  exit 1
+}
+# Banner: title framed by two rules, then the resolved inputs.
+banner_rule="${BOLD}========================================${NC}\n"
+printf "\n"
+printf "$banner_rule"
+printf "${BOLD}  OpenEnv Submission Validator${NC}\n"
+printf "$banner_rule"
+log "Repo:     $REPO_DIR"
+log "Ping URL: $PING_URL"
+printf "\n"
+# Step 1: POST an empty JSON body to <ping_url>/reset and require HTTP 200.
+log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
+CURL_OUTPUT=$(portable_mktemp "validate-curl")
+CLEANUP_FILES+=("$CURL_OUTPUT")
+# curl prints the -w value ("000") even when the transfer fails, so the failure
+# path must not append a second "000" (that produced "000000" and skipped the
+# "not reachable" branch). Default to 000 only when nothing was printed at all,
+# e.g. when curl itself is missing. stderr is discarded instead of sharing the
+# response-body file with -o.
+HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
+  -H "Content-Type: application/json" -d '{}' \
+  "$PING_URL/reset" --max-time 30 2>/dev/null || true)
+HTTP_CODE="${HTTP_CODE:-000}"
+if [ "$HTTP_CODE" = "200" ]; then
+  pass "HF Space is live and responds to /reset"
+elif [ "$HTTP_CODE" = "000" ]; then
+  fail "HF Space not reachable (connection failed or timed out)"
+  hint "Check your network connection and that the Space is running."
+  # hint() passes its text through %b, so a literal % needs no doubling here.
+  hint "Try: curl -s -o /dev/null -w '%{http_code}' -X POST $PING_URL/reset"
+  stop_at "Step 1"
+else
+  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
+  hint "Make sure your Space is running and the URL is correct."
+  hint "Try opening $PING_URL in your browser first."
+  stop_at "Step 1"
+fi
+log "${BOLD}Step 2/3: Running docker build${NC} ..."
+if ! command -v docker &>/dev/null; then
+  fail "docker command not found"
+  hint "Install Docker: https://docs.docker.com/get-docker/"
+  stop_at "Step 2"
+fi
+if [ -f "$REPO_DIR/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR"
+elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
+  DOCKER_CONTEXT="$REPO_DIR/server"
+else
+  fail "No Dockerfile found in repo root or server/ directory"
+  stop_at "Step 2"
+fi
+log "  Found Dockerfile in $DOCKER_CONTEXT"
+BUILD_OK=false
+BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
+if [ "$BUILD_OK" = true ]; then
+  pass "Docker build succeeded"
+else
+  fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
+  printf "%s\n" "$BUILD_OUTPUT" | tail -20
+  stop_at "Step 2"
+fi
+# Step 3: run `openenv validate` from inside the repo and report its output.
+log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
+command -v openenv &>/dev/null || {
+  fail "openenv command not found"
+  hint "Install it: pip install openenv-core"
+  stop_at "Step 3"
+}
+if VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1); then
+  VALIDATE_OK=true
+else
+  VALIDATE_OK=false
+fi
+case "$VALIDATE_OK" in
+  true)
+    pass "openenv validate passed"
+    # Echo the validator's own output only when it printed something.
+    if [ -n "$VALIDATE_OUTPUT" ]; then
+      log "  $VALIDATE_OUTPUT"
+    fi
+    ;;
+  *)
+    fail "openenv validate failed"
+    printf "%s\n" "$VALIDATE_OUTPUT"
+    stop_at "Step 3"
+    ;;
+esac
+# Reaching this point means no step called stop_at: print the success banner.
+printf "\n"
+printf "${BOLD}========================================${NC}\n"
+printf "${GREEN}${BOLD}  All 3/3 checks passed!${NC}\n"
+printf "${GREEN}${BOLD}  Your submission is ready to submit.${NC}\n"
+printf "${BOLD}========================================${NC}\n"
+printf "\n"
+exit 0
+```
+---
+REQUIREMENTS:
+- Use only models that are available on Hugging Face
+- Use the `openenv` CLI to stay compliant

inference.py ADDED Viewed

	@@ -0,0 +1,1067 @@

+"""Reproducible baseline for InvoiceOps."""
+from __future__ import annotations
+import json
+import os
+import re
+import sys
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Callable, TypeVar
+from openai import OpenAI
+from invoiceops_env import InvoiceOpsAction, InvoiceOpsEnv
+from invoiceops_env.models import (
+    ActionType,
+    Disposition,
+    DuplicateCandidate,
+    DuplicateMatchStrategy,
+    ExceptionDetail,
+    InvoiceOpsObservation,
+    NoteType,
+    PaymentRecommendation,
+    QueueCard,
+    ReasonCode,
+    RouteTarget,
+    TaskId,
+)
+# Base URL of the InvoiceOps environment server; override with ENV_URL.
+ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
+# OpenAI-compatible endpoint used when API_BASE_URL is not set.
+DEFAULT_HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1"
+API_BASE_URL = os.getenv("API_BASE_URL", DEFAULT_HF_ROUTER_BASE_URL)
+# Model identifier sent with every completion request; override with MODEL_NAME.
+MODEL_NAME = os.getenv("MODEL_NAME", "zai-org/GLM-5.1")
+# Sampling temperature passed to the completion call.
+TEMPERATURE = 0.0
+# Token budget for a normal attempt. int() raises ValueError at import time if the
+# environment variable is not a valid integer.
+MAX_TOKENS = int(os.getenv("MAX_TOKENS", "3000"))
+# Budget used for a retry after a "length" finish; never smaller than MAX_TOKENS.
+RETRY_MAX_TOKENS = max(MAX_TOKENS, int(os.getenv("RETRY_MAX_TOKENS", "5000")))
+# Maximum number of model calls _query_model_json makes for one prompt.
+MAX_MODEL_ATTEMPTS = 2
+BENCHMARK = "invoiceops_env"
+# Evaluation outputs are written under outputs/evals next to this file.
+OUTPUT_DIR = Path(__file__).resolve().parent / "outputs" / "evals"
+# Optional run label; when unset, build_output_path uses a UTC timestamp instead.
+EVAL_RUN_NAME = os.getenv("EVAL_RUN_NAME")
+# Task identifiers listed from easiest to hardest by name.
+TASKS = [
+    TaskId.EASY,
+    TaskId.MEDIUM,
+    TaskId.MEDIUM_PLUS,
+    TaskId.HARD,
+]
+# Fallback used by _coerce_payment_recommendation when a header payload carries a
+# line-style disposition instead of an explicit payment recommendation.
+HEADER_DISPOSITION_MAP: dict[Disposition, PaymentRecommendation] = {
+    Disposition.APPROVE: PaymentRecommendation.RELEASE_APPROVED_LINES,
+    Disposition.HOLD: PaymentRecommendation.HOLD_FULL_INVOICE,
+    Disposition.REJECT: PaymentRecommendation.REJECT_FULL_INVOICE,
+    Disposition.ESCALATE: PaymentRecommendation.ESCALATE_CASE,
+}
+# Type of the value a validator callback hands back to _query_model_json.
+ParsedModelOutput = TypeVar("ParsedModelOutput")
+def _env_flag(name: str, default: bool) -> bool:
+    raw_value = os.getenv(name)
+    if raw_value is None:
+        return default
+    return raw_value.strip().lower() not in {"0", "false", "no", "off", ""}
+def strict_task_score(raw_score: float, *, used_fallback: bool) -> float:
+    if used_fallback and _env_flag("STRICT_BASELINE_SCORING", True):
+        return 0.0
+    return raw_score
+@dataclass
+class EpisodeTrace:
+    """Running record of the rewards seen and the number of steps taken."""
+    # Rewards collected so far (presumably one entry per environment step -- confirm against the caller).
+    rewards: list[float] = field(default_factory=list)
+    # Number of steps taken so far.
+    steps_taken: int = 0
+@dataclass
+class ObservationMemory:
+    """Evidence accumulated across observations; filled in by update_memory."""
+    # Opened artifacts keyed by artifact_id.
+    opened_artifacts: dict[str, Any] = field(default_factory=dict)
+    # Inspected exception details keyed by exception_id.
+    inspected_exceptions: dict[str, ExceptionDetail] = field(default_factory=dict)
+    # Most recent non-empty duplicate-candidate list reported by an observation.
+    duplicate_candidates: list[DuplicateCandidate] = field(default_factory=list)
+def resolve_api_key() -> tuple[str | None, str | None]:
+    token = os.getenv("HF_TOKEN")
+    return (token, "HF_TOKEN") if token else (None, None)
+def _slugify(value: str) -> str:
+    slug = re.sub(r"[^A-Za-z0-9._-]+", "-", value.strip())
+    slug = slug.strip("-._")
+    return slug or "run"
+def build_output_path(model_name: str) -> tuple[str, Path]:
+    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    run_id = _slugify(EVAL_RUN_NAME) if EVAL_RUN_NAME else timestamp
+    model_slug = _slugify(model_name)
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    candidate = OUTPUT_DIR / f"{run_id}__{model_slug}.json"
+    suffix = 2
+    while candidate.exists():
+        candidate = OUTPUT_DIR / f"{run_id}__{model_slug}__{suffix}.json"
+        suffix += 1
+    return run_id, candidate
+def _sanitize_log_value(value: str | None) -> str:
+    if not value:
+        return "null"
+    return value.replace("\n", " ").strip() or "null"
+def format_action_for_log(action: InvoiceOpsAction) -> str:
+    return json.dumps(
+        action.model_dump(mode="json", exclude_none=True),
+        separators=(",", ":"),
+        sort_keys=True,
+    )
+def _extract_step_error(
+    observation: InvoiceOpsObservation | None,
+    *,
+    previous_invalid_actions: int,
+) -> str | None:
+    if observation is None:
+        return None
+    if observation.progress.invalid_actions > previous_invalid_actions:
+        return observation.message or None
+    return None
+def log_start(task: str, env: str, model: str) -> None:
+    print(f"[START] task={task} env={env} model={model}", flush=True)
+def log_step(
+    step: int, action: str, reward: float, done: bool, error: str | None
+) -> None:
+    print(
+        f"[STEP] step={step} action={action} reward={reward:.2f} "
+        f"done={str(done).lower()} error={_sanitize_log_value(error)}",
+        flush=True,
+    )
+def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
+    rewards_str = ",".join(f"{reward:.2f}" for reward in rewards)
+    print(
+        f"[END] success={str(success).lower()} steps={steps} "
+        f"score={score:.3f} rewards={rewards_str}",
+        flush=True,
+    )
+def _safe_json_load(text: str) -> dict[str, Any] | None:
+    text = text.strip()
+    if not text:
+        return None
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL | re.IGNORECASE)
+    text = re.sub(
+        r"<reasoning>.*?</reasoning>",
+        "",
+        text,
+        flags=re.DOTALL | re.IGNORECASE,
+    )
+    if text.startswith("```"):
+        text = re.sub(r"^```(?:json)?\s*", "", text)
+        text = re.sub(r"\s*```$", "", text)
+    try:
+        payload = json.loads(text)
+    except json.JSONDecodeError:
+        match = re.search(r"\{.*\}", text, re.DOTALL)
+        if not match:
+            return None
+        try:
+            payload = json.loads(match.group(0))
+        except json.JSONDecodeError:
+            return None
+    return payload if isinstance(payload, dict) else None
+def _normalize_completion_content(raw_content: Any) -> str:
+    if raw_content is None:
+        return ""
+    if isinstance(raw_content, str):
+        return raw_content
+    if isinstance(raw_content, list):
+        parts: list[str] = []
+        for item in raw_content:
+            if isinstance(item, dict):
+                text = item.get("text")
+                if isinstance(text, str):
+                    parts.append(text)
+                continue
+            text = getattr(item, "text", None)
+            if isinstance(text, str):
+                parts.append(text)
+        return "\n".join(part for part in parts if part)
+    return str(raw_content)
+def _attempt_trace(
+    *,
+    completion: Any | None = None,
+    content: str = "",
+    payload: dict[str, Any] | None = None,
+    parsed_ok: bool = False,
+    failure_reason: str | None = None,
+    error: Exception | None = None,
+) -> dict[str, Any]:
+    trace: dict[str, Any] = {
+        "content": content,
+        "content_empty": not bool(content.strip()),
+        "json_detected": payload is not None,
+        "validation_passed": parsed_ok,
+        "failure_reason": failure_reason,
+    }
+    if error is not None:
+        trace["error_type"] = error.__class__.__name__
+        trace["error_message"] = str(error)
+    if completion is None:
+        return trace
+    trace["response_id"] = getattr(completion, "id", None)
+    choices = getattr(completion, "choices", None) or []
+    if choices:
+        choice = choices[0]
+        trace["finish_reason"] = getattr(choice, "finish_reason", None)
+        message = getattr(choice, "message", None)
+        if message is not None:
+            if hasattr(message, "model_dump"):
+                trace["raw_message"] = message.model_dump(
+                    mode="json", exclude_none=True
+                )
+            else:
+                trace["raw_message"] = str(message)
+    usage = getattr(completion, "usage", None)
+    if usage is not None and hasattr(usage, "model_dump"):
+        trace["usage"] = usage.model_dump(mode="json", exclude_none=True)
+    return trace
+def _query_model_json(
+    openai_client: OpenAI,
+    *,
+    system_prompt: str,
+    user_prompt: str,
+    validator: Callable[[dict[str, Any] | None], ParsedModelOutput | None],
+    retry_feedback: str,
+) -> tuple[ParsedModelOutput | None, list[dict[str, Any]]]:
+    """Ask the model for a JSON reply and validate it, retrying on failure.
+
+    Up to MAX_MODEL_ATTEMPTS requests are made. After each failed attempt a
+    corrective user message containing ``retry_feedback`` is appended to the
+    conversation before the next request.
+
+    Args:
+        openai_client: Client used for the chat completion calls.
+        system_prompt: Content of the system message.
+        user_prompt: Content of the first user message.
+        validator: Receives the decoded JSON object (or None when none was
+            found) and returns the parsed result, or None to reject it.
+        retry_feedback: Text inserted into the corrective message on retry.
+
+    Returns:
+        ``(parsed, attempts)`` where ``parsed`` is the validator's result or
+        None when every attempt failed, and ``attempts`` holds one
+        _attempt_trace record per attempt.
+    """
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
+    attempts: list[dict[str, Any]] = []
+    for attempt in range(MAX_MODEL_ATTEMPTS):
+        # Use the larger retry budget only when the previous attempt was cut off by length.
+        expand_token_budget = bool(
+            attempts and attempts[-1].get("finish_reason") == "length"
+        )
+        try:
+            completion = openai_client.chat.completions.create(
+                model=MODEL_NAME,
+                messages=messages,
+                temperature=TEMPERATURE,
+                response_format={"type": "json_object"},
+                max_tokens=(RETRY_MAX_TOKENS if expand_token_budget else MAX_TOKENS),
+            )
+        except Exception as exc:
+            # Any client/transport failure is recorded and treated as a failed attempt.
+            attempts.append(
+                _attempt_trace(
+                    failure_reason="request_error",
+                    error=exc,
+                )
+            )
+            if attempt == MAX_MODEL_ATTEMPTS - 1:
+                break
+            messages.append(
+                {
+                    "role": "user",
+                    "content": (
+                        "The previous request failed before a usable response was returned. "
+                        f"{retry_feedback} Reply with JSON only and no prose."
+                    ),
+                }
+            )
+            continue
+        choices = getattr(completion, "choices", None) or []
+        if not choices:
+            attempts.append(
+                _attempt_trace(
+                    completion=completion,
+                    failure_reason="no_choices",
+                )
+            )
+            if attempt == MAX_MODEL_ATTEMPTS - 1:
+                break
+            messages.append(
+                {
+                    "role": "user",
+                    "content": (
+                        "The previous reply did not contain any choices. "
+                        f"{retry_feedback} Reply with JSON only and no prose."
+                    ),
+                }
+            )
+            continue
+        message = choices[0].message
+        content = _normalize_completion_content(getattr(message, "content", None))
+        payload = _safe_json_load(content)
+        # The validator is called even when no JSON object was found (payload is None).
+        parsed = validator(payload)
+        if parsed is not None:
+            attempts.append(
+                _attempt_trace(
+                    completion=completion,
+                    content=content,
+                    payload=payload,
+                    parsed_ok=True,
+                )
+            )
+            return parsed, attempts
+        # Classify the failure from most to least basic cause.
+        if not content.strip():
+            failure_reason = "empty_content"
+        elif payload is None:
+            failure_reason = "json_not_found"
+        else:
+            failure_reason = "schema_validation_failed"
+        attempts.append(
+            _attempt_trace(
+                completion=completion,
+                content=content,
+                payload=payload,
+                parsed_ok=False,
+                failure_reason=failure_reason,
+            )
+        )
+        if attempt == MAX_MODEL_ATTEMPTS - 1:
+            break
+        # Replay the rejected reply so the model sees what it is being asked to correct.
+        messages.extend(
+            [
+                {"role": "assistant", "content": content or "<empty_response>"},
+                {
+                    "role": "user",
+                    "content": (
+                        "Your previous reply could not be used. "
+                        f"{retry_feedback} Reply with JSON only and no prose."
+                    ),
+                },
+            ]
+        )
+    return None, attempts
+def _coerce_reason_codes(values: Any) -> list[ReasonCode]:
+    if isinstance(values, str):
+        raw_values = [values]
+    elif isinstance(values, list):
+        raw_values = values
+    else:
+        return []
+    codes: list[ReasonCode] = []
+    for value in raw_values:
+        if not isinstance(value, str):
+            continue
+        try:
+            code = ReasonCode(value)
+        except ValueError:
+            continue
+        if code not in codes:
+            codes.append(code)
+    return codes
+def _coerce_string_list(values: Any) -> list[str]:
+    if isinstance(values, str):
+        raw_values = [values]
+    elif isinstance(values, list):
+        raw_values = values
+    else:
+        return []
+    refs: list[str] = []
+    for value in raw_values:
+        if not isinstance(value, str):
+            continue
+        ref = value.strip()
+        if not ref or ref in refs:
+            continue
+        refs.append(ref)
+    return refs
+def _coerce_action_type(value: Any) -> ActionType | None:
+    if not isinstance(value, str):
+        return None
+    try:
+        return ActionType(value)
+    except ValueError:
+        return None
+def _coerce_match_strategy(value: Any) -> DuplicateMatchStrategy | None:
+    if not isinstance(value, str):
+        return None
+    normalized = value.strip().lower()
+    aliases = {
+        "exact_invoice_no": DuplicateMatchStrategy.EXACT_INVOICE_NUMBER,
+        "exact_invoice_number": DuplicateMatchStrategy.EXACT_INVOICE_NUMBER,
+        "invoice_number_exact": DuplicateMatchStrategy.EXACT_INVOICE_NUMBER,
+        "normalized_invoice_no": DuplicateMatchStrategy.NORMALIZED_INVOICE_NUMBER,
+        "normalized_invoice_number": DuplicateMatchStrategy.NORMALIZED_INVOICE_NUMBER,
+        "normalized_invoice": DuplicateMatchStrategy.NORMALIZED_INVOICE_NUMBER,
+        "vendor_amount_date": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
+        "vendor_amount": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
+        "vendor_invoice_amount": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
+        "exact_vendor_invoice_amount": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
+        "vendor_amount_and_date": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
+    }
+    strategy = aliases.get(normalized)
+    if strategy is not None:
+        return strategy
+    try:
+        return DuplicateMatchStrategy(value)
+    except ValueError:
+        return None
+def _coerce_note_type(value: Any) -> NoteType | None:
+    if not isinstance(value, str):
+        return None
+    try:
+        return NoteType(value)
+    except ValueError:
+        return None
+def _coerce_route(value: Any) -> RouteTarget | None:
+    if not isinstance(value, str):
+        return None
+    try:
+        return RouteTarget(value)
+    except ValueError:
+        return None
+def _coerce_disposition(value: Any) -> Disposition | None:
+    if not isinstance(value, str):
+        return None
+    try:
+        return Disposition(value)
+    except ValueError:
+        return None
+def _coerce_payment_recommendation(
+    raw_header: dict[str, Any] | str | None,
+) -> PaymentRecommendation | None:
+    if isinstance(raw_header, str):
+        try:
+            return PaymentRecommendation(raw_header)
+        except ValueError:
+            return None
+    if not isinstance(raw_header, dict):
+        return None
+    for key in ("payment_recommendation", "header_recommendation", "recommendation"):
+        raw_value = raw_header.get(key)
+        if not isinstance(raw_value, str):
+            continue
+        try:
+            return PaymentRecommendation(raw_value)
+        except ValueError:
+            continue
+    disposition = _coerce_disposition(
+        raw_header.get("disposition") or raw_header.get("decision")
+    )
+    if disposition is None:
+        return None
+    return HEADER_DISPOSITION_MAP.get(disposition)
+def _extract_action_payload(payload: dict[str, Any] | None) -> dict[str, Any] | None:
+    if payload is None:
+        return None
+    if isinstance(payload.get("action"), dict):
+        raw_action = dict(payload["action"])
+        if "action_type" not in raw_action and isinstance(
+            payload.get("action_type"), str
+        ):
+            raw_action["action_type"] = payload["action_type"]
+        return raw_action
+    if isinstance(payload.get("args"), dict) and isinstance(payload.get("action"), str):
+        raw_action = dict(payload["args"])
+        raw_action.setdefault("action_type", payload["action"])
+        return raw_action
+    if isinstance(payload.get("arguments"), dict) and isinstance(
+        payload.get("action"), str
+    ):
+        raw_action = dict(payload["arguments"])
+        raw_action.setdefault("action_type", payload["action"])
+        return raw_action
+    return dict(payload)
+def _parse_action_payload(payload: dict[str, Any] | None) -> InvoiceOpsAction | None:
+    """Turn a decoded model reply into an InvoiceOpsAction, tolerating key aliases.
+
+    The action dict is first unwrapped with _extract_action_payload. The
+    action type is read from the first truthy of "action_type", "action",
+    "type", "kind" or "name". Only the fields relevant to that action type
+    are collected, each accepting a few alternative key names.
+
+    Returns:
+        The constructed action, or None when the payload is missing, the
+        action type is unknown, a duplicate check has no recognisable match
+        strategy, or InvoiceOpsAction construction raises.
+    """
+    raw_action = _extract_action_payload(payload)
+    if raw_action is None:
+        return None
+    action_type = _coerce_action_type(
+        raw_action.get("action_type")
+        or raw_action.get("action")
+        or raw_action.get("type")
+        or raw_action.get("kind")
+        or raw_action.get("name")
+    )
+    if action_type is None:
+        return None
+    action_kwargs: dict[str, Any] = {
+        "action_type": action_type,
+    }
+    if action_type is ActionType.OPEN_ARTIFACT:
+        action_kwargs["artifact_id"] = (
+            raw_action.get("artifact_id")
+            or raw_action.get("artifact")
+            or raw_action.get("id")
+        )
+    elif action_type is ActionType.INSPECT_EXCEPTION:
+        action_kwargs["exception_id"] = (
+            raw_action.get("exception_id")
+            or raw_action.get("exception")
+            or raw_action.get("id")
+        )
+    elif action_type is ActionType.RUN_DUPLICATE_CHECK:
+        match_strategy = raw_action.get("match_strategy") or raw_action.get("strategy")
+        action_kwargs["match_strategy"] = _coerce_match_strategy(match_strategy)
+        # A duplicate check without a recognisable strategy is rejected here
+        # rather than being passed on to the action constructor.
+        if action_kwargs["match_strategy"] is None:
+            return None
+    elif action_type is ActionType.ADD_NOTE:
+        action_kwargs["note_type"] = _coerce_note_type(
+            raw_action.get("note_type") or raw_action.get("note_kind")
+        )
+        action_kwargs["reason_codes"] = _coerce_reason_codes(
+            raw_action.get("reason_codes") or raw_action.get("reason_code")
+        )
+        action_kwargs["evidence_refs"] = _coerce_string_list(
+            raw_action.get("evidence_refs")
+            or raw_action.get("evidence_ref")
+            or raw_action.get("refs")
+        )
+        action_kwargs["text"] = raw_action.get("text")
+    elif action_type is ActionType.SET_LINE_RESOLUTION:
+        action_kwargs["line_id"] = raw_action.get("line_id") or raw_action.get("line")
+        action_kwargs["disposition"] = _coerce_disposition(
+            raw_action.get("disposition") or raw_action.get("decision")
+        )
+        action_kwargs["reason_codes"] = _coerce_reason_codes(
+            raw_action.get("reason_codes") or raw_action.get("reason_code")
+        )
+        action_kwargs["evidence_refs"] = _coerce_string_list(
+            raw_action.get("evidence_refs")
+            or raw_action.get("evidence_ref")
+            or raw_action.get("refs")
+        )
+        action_kwargs["route_to"] = _coerce_route(
+            raw_action.get("route_to")
+            or raw_action.get("route")
+            or raw_action.get("escalation_target")
+        )
+    elif action_type is ActionType.SET_HEADER_RESOLUTION:
+        # The whole action dict is passed so the helper can also fall back to
+        # a "disposition"/"decision" entry.
+        action_kwargs["payment_recommendation"] = _coerce_payment_recommendation(
+            raw_action
+        )
+        action_kwargs["reason_codes"] = _coerce_reason_codes(
+            raw_action.get("reason_codes") or raw_action.get("reason_code")
+        )
+        action_kwargs["evidence_refs"] = _coerce_string_list(
+            raw_action.get("evidence_refs")
+            or raw_action.get("evidence_ref")
+            or raw_action.get("refs")
+        )
+        action_kwargs["route_to"] = _coerce_route(
+            raw_action.get("route_to")
+            or raw_action.get("route")
+            or raw_action.get("escalation_target")
+        )
+    elif action_type is ActionType.SUBMIT_CASE:
+        action_kwargs["note_ids"] = _coerce_string_list(raw_action.get("note_ids"))
+        action_kwargs["line_resolution_ids"] = _coerce_string_list(
+            raw_action.get("line_resolution_ids")
+        )
+        header_resolution_id = raw_action.get("header_resolution_id")
+        # The key is only set when the model supplied a string.
+        if isinstance(header_resolution_id, str):
+            action_kwargs["header_resolution_id"] = header_resolution_id.strip()
+    try:
+        return InvoiceOpsAction(**action_kwargs)
+    except Exception:
+        # Any construction failure is reported to the caller as "no action".
+        return None
+def build_case_snapshot(
+    queue_card: QueueCard,
+    opened_artifacts: dict[str, Any],
+    inspected_exceptions: dict[str, ExceptionDetail],
+    duplicate_candidates: list[DuplicateCandidate],
+) -> dict[str, Any]:
+    """Build a compact dict view of the case for inclusion in a prompt.
+
+    Free-text values are whitespace-collapsed and truncated, and list-like
+    sections are capped at a fixed number of entries, to keep the snapshot small.
+
+    Args:
+        queue_card: Summary card of the invoice under review.
+        opened_artifacts: Artifacts opened so far; only the values are used.
+        inspected_exceptions: Exception details inspected so far; only the values are used.
+        duplicate_candidates: Candidates returned by a duplicate check.
+
+    Returns:
+        A dict with the keys "queue_card", "artifacts", "exceptions" and
+        "duplicate_candidates".
+    """
+    # Collapse whitespace runs to single spaces and cut to at most `limit` characters,
+    # ending in "..." when truncated.
+    def compact_text(value: str, *, limit: int = 180) -> str:
+        normalized = re.sub(r"\s+", " ", value.strip())
+        if len(normalized) <= limit:
+            return normalized
+        return f"{normalized[: limit - 3].rstrip()}..."
+    # Map label -> truncated value for the first `limit` fields, skipping blank labels/values.
+    # Assumes each field exposes string `.label` and `.value` attributes -- TODO confirm.
+    def compact_fields(fields: list[Any], *, limit: int = 10) -> dict[str, str]:
+        compact: dict[str, str] = {}
+        for field in fields[:limit]:
+            label = field.label.strip()
+            value = field.value.strip()
+            if not label or not value:
+                continue
+            compact[label] = compact_text(value, limit=120)
+        return compact
+    # Reduce the first `limit` line items to their id, description and amount,
+    # adding optional fields only when they are set.
+    def compact_line_items(
+        line_items: list[Any], *, limit: int = 6
+    ) -> list[dict[str, Any]]:
+        compact_items: list[dict[str, Any]] = []
+        for item in line_items[:limit]:
+            compact_item: dict[str, Any] = {
+                "line_id": item.line_id,
+                "description": compact_text(item.description, limit=100),
+                "amount": item.amount,
+            }
+            if item.quantity is not None:
+                compact_item["quantity"] = item.quantity
+            if item.unit_price is not None:
+                compact_item["unit_price"] = item.unit_price
+            if item.status:
+                compact_item["status"] = compact_text(item.status, limit=60)
+            if item.notes:
+                compact_item["notes"] = compact_text(item.notes, limit=100)
+            compact_items.append(compact_item)
+        return compact_items
+    # Reduce the first `limit` events to type, date and description,
+    # adding optional fields only when they are set.
+    def compact_events(events: list[Any], *, limit: int = 8) -> list[dict[str, Any]]:
+        compact_events_list: list[dict[str, Any]] = []
+        for event in events[:limit]:
+            compact_event: dict[str, Any] = {
+                "type": event.event_type,
+                "date": event.event_date,
+                "description": compact_text(event.description, limit=120),
+            }
+            if event.quantity is not None:
+                compact_event["quantity"] = event.quantity
+            if event.amount is not None:
+                compact_event["amount"] = event.amount
+            if event.status:
+                compact_event["status"] = compact_text(event.status, limit=60)
+            compact_events_list.append(compact_event)
+        return compact_events_list
+    # Title plus whichever of summary/fields/line_items/events are non-empty.
+    def compact_artifact(artifact: Any) -> dict[str, Any]:
+        compact_artifact_view: dict[str, Any] = {
+            "title": artifact.title,
+        }
+        if artifact.summary:
+            compact_artifact_view["summary"] = compact_text(artifact.summary)
+        fields = compact_fields(artifact.fields)
+        if fields:
+            compact_artifact_view["fields"] = fields
+        line_items = compact_line_items(artifact.line_items)
+        if line_items:
+            compact_artifact_view["line_items"] = line_items
+        events = compact_events(artifact.events)
+        if events:
+            compact_artifact_view["events"] = events
+        return compact_artifact_view
+    # Type, severity and headline, plus optional impacted lines, summary, facts and guidance.
+    def compact_exception(exception: ExceptionDetail) -> dict[str, Any]:
+        compact_exception_view: dict[str, Any] = {
+            "type": exception.exception_type.value,
+            "severity": exception.severity.value,
+            "headline": compact_text(exception.headline, limit=120),
+        }
+        if exception.impacted_line_ids:
+            compact_exception_view["impacted_line_ids"] = exception.impacted_line_ids
+        if exception.short_description:
+            compact_exception_view["summary"] = compact_text(
+                exception.short_description,
+                limit=140,
+            )
+        fields = compact_fields(exception.fields, limit=8)
+        if fields:
+            compact_exception_view["facts"] = fields
+        if exception.reviewer_guidance:
+            compact_exception_view["guidance"] = compact_text(
+                exception.reviewer_guidance,
+                limit=160,
+            )
+        return compact_exception_view
+    # Fixed set of identifying fields for one duplicate candidate.
+    def compact_duplicate(candidate: DuplicateCandidate) -> dict[str, Any]:
+        return {
+            "candidate_id": candidate.candidate_id,
+            "invoice_number": candidate.invoice_number,
+            "invoice_date": candidate.invoice_date,
+            "gross_amount": candidate.gross_amount,
+            "status": candidate.status,
+            "match_basis": compact_text(candidate.match_basis, limit=80),
+            "overlap_summary": compact_text(candidate.overlap_summary, limit=140),
+        }
+    return {
+        "queue_card": {
+            "vendor_name": queue_card.vendor_name,
+            "vendor_id": queue_card.vendor_id,
+            "invoice_number": queue_card.invoice_number,
+            "invoice_date": queue_card.invoice_date,
+            "invoice_total": queue_card.invoice_total,
+            "currency": queue_card.currency,
+            "po_number": queue_card.po_number,
+            "risk_flags": [flag.value for flag in queue_card.risk_flags],
+            "summary": compact_text(queue_card.summary, limit=160),
+        },
+        # NOTE(review): keyed by artifact type, so two opened artifacts of the same
+        # type would collapse into one entry -- confirm types are unique per case.
+        "artifacts": {
+            artifact.artifact_type.value: compact_artifact(artifact)
+            for artifact in opened_artifacts.values()
+        },
+        "exceptions": [
+            compact_exception(exception) for exception in inspected_exceptions.values()
+        ],
+        "duplicate_candidates": [
+            compact_duplicate(candidate) for candidate in duplicate_candidates
+        ],
+    }
+def update_memory(
+    memory: ObservationMemory,
+    observation: InvoiceOpsObservation,
+) -> None:
+    if observation.opened_artifact is not None:
+        memory.opened_artifacts[observation.opened_artifact.artifact_id] = (
+            observation.opened_artifact
+        )
+    if observation.inspected_exception is not None:
+        memory.inspected_exceptions[observation.inspected_exception.exception_id] = (
+            observation.inspected_exception
+        )
+    if observation.duplicate_candidates:
+        memory.duplicate_candidates = observation.duplicate_candidates
+def build_observation_snapshot(
+    observation: InvoiceOpsObservation,
+    memory: ObservationMemory,
+) -> dict[str, Any]:
+    queue_card = observation.queue_card
+    assert queue_card is not None
+    base_snapshot = build_case_snapshot(
+        queue_card,
+        memory.opened_artifacts,
+        memory.inspected_exceptions,
+        memory.duplicate_candidates,
+    )
+    base_snapshot["message"] = observation.message
+    base_snapshot["progress"] = observation.progress.model_dump(mode="json")
+    base_snapshot["known_refs"] = observation.known_refs
+    base_snapshot["available_artifacts"] = [
+        artifact.model_dump(mode="json") for artifact in observation.available_artifacts
+    ]
+    base_snapshot["visible_exceptions"] = [
+        exception.model_dump(mode="json")
+        for exception in observation.visible_exceptions
+    ]
+    base_snapshot["current_focus"] = {
+        "opened_artifact_id": (
+            observation.opened_artifact.artifact_id
+            if observation.opened_artifact is not None
+            else None
+        ),
+        "inspected_exception_id": (
+            observation.inspected_exception.exception_id
+            if observation.inspected_exception is not None
+            else None
+        ),
+    }
+    base_snapshot["draft_state"] = {
+        "line_resolutions": [
+            line_resolution.model_dump(mode="json")
+            for line_resolution in observation.draft_line_resolutions
+        ],
+        "header_resolution": (
+            observation.draft_header_resolution.model_dump(mode="json")
+            if observation.draft_header_resolution is not None
+            else None
+        ),
+        "notes": [note.model_dump(mode="json") for note in observation.draft_notes],
+    }
+    return base_snapshot
+def build_action_prompt(
+    observation: InvoiceOpsObservation,
+    memory: ObservationMemory,
+) -> str:
+    snapshot = build_observation_snapshot(observation, memory)
+    return (
+        "You are controlling an AP invoice exception environment one action at a time.\n"
+        "Return exactly one JSON object for the single best next action. No prose. No markdown. No multi-action plans.\n"
+        "Do not assume you have seen artifacts or exception details that are not in the observation snapshot.\n"
+        "Use open_artifact, inspect_exception, and run_duplicate_check to gather evidence before deciding.\n"
+        "Only use evidence_refs from known_refs. Invalid refs will be penalized.\n"
+        "Only add notes or resolutions when you have enough visible evidence to support them.\n"
+        "route_to means the next owner or follow-up queue for the action. Use it whenever another queue must act, including hold actions that still need follow-up.\n"
+        "Line resolutions describe content/payment readiness for each line. Header resolution describes whether any payment can be released now.\n"
+        "A real case-level blocker can justify hold_full_invoice or escalate_case even when some lines are approved.\n"
+        "Submit only when the current draft state is coherent or when no better action remains.\n\n"
+        f"Allowed action_type values: {[action.value for action in ActionType]}\n"
+        f"Allowed match_strategy values: {[strategy.value for strategy in DuplicateMatchStrategy]}\n"
+        f"Allowed disposition values: {[disposition.value for disposition in Disposition]}\n"
+        f"Allowed payment_recommendation values: {[recommendation.value for recommendation in PaymentRecommendation]}\n"
+        f"Allowed route_to values: {[route.value for route in RouteTarget]}\n"
+        f"Allowed note_type values: {[note_type.value for note_type in NoteType]}\n"
+        f"Allowed reason_codes values: {[reason.value for reason in ReasonCode]}\n"
+        "Action JSON templates (replace angle-bracket placeholders with real values from the observation; omit optional fields when unused):\n"
+        '{"action_type":"open_artifact","artifact_id":"<artifact_id>"}\n'
+        '{"action_type":"inspect_exception","exception_id":"<exception_id>"}\n'
+        '{"action_type":"run_duplicate_check","match_strategy":"normalized_invoice_no"}\n'
+        '{"action_type":"set_line_resolution","line_id":"<line_id>","disposition":"<disposition>","reason_codes":["<reason_code>"],"evidence_refs":["<known_ref>"],"route_to":"<optional_route_target>"}\n'
+        '{"action_type":"set_header_resolution","payment_recommendation":"<payment_recommendation>","reason_codes":["<reason_code>"],"evidence_refs":["<known_ref>"],"route_to":"<optional_route_target>"}\n'
+        '{"action_type":"add_note","note_type":"<note_type>","reason_codes":["<reason_code>"],"evidence_refs":["<known_ref>"],"text":"<brief_handoff_note>"}\n'
+        '{"action_type":"submit_case"}\n\n'
+        f"Observation snapshot:\n{json.dumps(snapshot, indent=2)}"
+    )
+def request_action_from_model(
+    openai_client: OpenAI,
+    *,
+    observation: InvoiceOpsObservation,
+    memory: ObservationMemory,
+) -> tuple[InvoiceOpsAction | None, list[dict[str, Any]]]:
+    return _query_model_json(
+        openai_client,
+        system_prompt=(
+            "You are a deterministic AP invoice reviewer acting in an environment. "
+            "Return exactly one valid JSON action and nothing else."
+        ),
+        user_prompt=build_action_prompt(observation, memory),
+        validator=_parse_action_payload,
+        retry_feedback=(
+            "Return exactly one action object with action_type and only the fields required for that action. "
+            'Examples: {"action_type":"open_artifact","artifact_id":"art-invoice"} '
+            'or {"action_type":"submit_case"}. '
+            "Do not output a plan or multiple actions."
+        ),
+    )
+def run_task(
+    env: Any,
+    openai_client: OpenAI,
+    task_id: TaskId,
+    trace: EpisodeTrace,
+) -> dict[str, Any]:
+    try:
+        reset_result = env.reset(task_id=task_id.value)
+        observation = reset_result.observation
+        initial_queue_card = observation.queue_card
+        memory = ObservationMemory()
+        update_memory(memory, observation)
+        model_attempts: list[dict[str, Any]] = []
+        action_history: list[dict[str, Any]] = []
+        used_fallback = False
+        decision_parsed = True
+        failure_reason: str | None = None
+        while not observation.done:
+            action, attempts = request_action_from_model(
+                openai_client,
+                observation=observation,
+                memory=memory,
+            )
+            model_attempts.append(
+                {
+                    "turn_index": len(model_attempts) + 1,
+                    "attempts": attempts,
+                }
+            )
+            if action is None:
+                used_fallback = True
+                decision_parsed = False
+                failure_reason = (
+                    attempts[-1]["failure_reason"] if attempts else "no_attempt"
+                )
+                action = InvoiceOpsAction(action_type=ActionType.SUBMIT_CASE)
+                model_attempts[-1]["fallback_action"] = action.model_dump(
+                    mode="json",
+                    exclude_none=True,
+                )
+            previous_invalid_actions = observation.progress.invalid_actions
+            result = env.step(action)
+            reward = float(result.reward or 0.0)
+            trace.steps_taken += 1
+            trace.rewards.append(reward)
+            log_step(
+                trace.steps_taken,
+                format_action_for_log(action),
+                reward,
+                bool(result.done),
+                _extract_step_error(
+                    result.observation,
+                    previous_invalid_actions=previous_invalid_actions,
+                ),
+            )
+            action_history.append(
+                {
+                    "step": trace.steps_taken,
+                    "action": action.model_dump(mode="json", exclude_none=True),
+                    "reward": reward,
+                    "done": bool(result.done),
+                    "message": result.observation.message,
+                }
+            )
+            observation = result.observation
+            update_memory(memory, observation)
+        raw_score = float(observation.episode_score or 0.0)
+        score = strict_task_score(raw_score, used_fallback=used_fallback)
+        return {
+            "task_id": task_id.value,
+            "queue_card": (
+                initial_queue_card.model_dump(mode="json")
+                if initial_queue_card is not None
+                else None
+            ),
+            "decision_parsed": decision_parsed,
+            "used_fallback": used_fallback,
+            "failure_reason": failure_reason,
+            "parsed_line_count": len(observation.draft_line_resolutions),
+            "parsed_header_resolution": observation.draft_header_resolution is not None,
+            "model_attempts": model_attempts,
+            "action_history": action_history,
+            "raw_score": raw_score,
+            "score": score,
+            "steps_used": trace.steps_taken,
+            "reward_trace": trace.rewards,
+            "submission_report": (
+                observation.submission_report.model_dump(mode="json")
+                if observation.submission_report is not None
+                else None
+            ),
+            "error": None,
+        }
+    except Exception as exc:
+        return {
+            "task_id": task_id.value,
+            "queue_card": None,
+            "decision_parsed": False,
+            "used_fallback": False,
+            "failure_reason": "task_execution_error",
+            "parsed_line_count": 0,
+            "parsed_header_resolution": False,
+            "model_attempts": [],
+            "action_history": [],
+            "raw_score": 0.0,
+            "score": 0.0,
+            "steps_used": trace.steps_taken,
+            "reward_trace": trace.rewards,
+            "submission_report": None,
+            "error": str(exc),
+        }
+def main() -> None:
+    api_key, api_key_source = resolve_api_key()
+    api_base_url = API_BASE_URL
+    if not api_key:
+        raise RuntimeError("Set HF_TOKEN before running inference.py.")
+    openai_client = OpenAI(api_key=api_key, base_url=api_base_url)
+    run_id, output_path = build_output_path(MODEL_NAME)
+    results: list[dict[str, Any]] = []
+    for task_id in TASKS:
+        trace = EpisodeTrace()
+        log_start(task=task_id.value, env=BENCHMARK, model=MODEL_NAME)
+        task_result: dict[str, Any] | None = None
+        try:
+            with InvoiceOpsEnv(base_url=ENV_URL).sync() as env:
+                task_result = run_task(env, openai_client, task_id, trace)
+        finally:
+            score = float(task_result["score"]) if task_result is not None else 0.0
+            success = task_result is not None and task_result.get("error") is None
+            log_end(
+                success=success,
+                steps=trace.steps_taken,
+                score=score,
+                rewards=trace.rewards,
+            )
+        assert task_result is not None
+        results.append(task_result)
+        sys.stderr.write(
+            f"{task_id.value}: score={task_result['score']:.4f} "
+            f"raw_score={task_result.get('raw_score', task_result['score']):.4f} "
+            f"fallback={str(task_result['used_fallback']).lower()}\n"
+        )
+    mean_score = sum(result["score"] for result in results) / len(results)
+    raw_mean_score = sum(
+        result.get("raw_score", result["score"]) for result in results
+    ) / len(results)
+    payload = {
+        "run_id": run_id,
+        "model_name": MODEL_NAME,
+        "env_url": ENV_URL,
+        "api_base_url": api_base_url,
+        "api_key_source": api_key_source,
+        "raw_mean_score": round(raw_mean_score, 4),
+        "mean_score": round(mean_score, 4),
+        "strict_baseline_scoring": _env_flag("STRICT_BASELINE_SCORING", True),
+        "results": results,
+    }
+    output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    sys.stderr.write(
+        f"mean_score={mean_score:.4f} raw_mean_score={raw_mean_score:.4f}\n"
+    )
+    sys.stderr.write(f"wrote={output_path}\n")
+if __name__ == "__main__":
+    main()

models.py ADDED Viewed

	@@ -0,0 +1,583 @@

+"""Typed models for the InvoiceOps environment."""
+from __future__ import annotations
+from enum import Enum
+from openenv.core.env_server.types import Action, Observation, State
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+class Model(BaseModel):  # Shared pydantic base class for the plain data models in this module.
+    model_config = ConfigDict(
+        extra="forbid",  # unknown fields are rejected
+        validate_assignment=True,  # attribute assignment is validated too
+        arbitrary_types_allowed=True,
+    )
+class TaskId(str, Enum):  # Task bucket for a case; its .value is what gets passed to env.reset(task_id=...).
+    EASY = "easy"
+    MEDIUM = "medium"
+    MEDIUM_PLUS = "medium_plus"
+    HARD = "hard"
+class ActionType(str, Enum):  # Kinds of action an InvoiceOpsAction can carry; each has its own required fields.
+    OPEN_ARTIFACT = "open_artifact"
+    INSPECT_EXCEPTION = "inspect_exception"
+    RUN_DUPLICATE_CHECK = "run_duplicate_check"
+    ADD_NOTE = "add_note"
+    SET_LINE_RESOLUTION = "set_line_resolution"
+    SET_HEADER_RESOLUTION = "set_header_resolution"
+    SUBMIT_CASE = "submit_case"  # the only action type with no required fields
+class ArtifactType(str, Enum):  # Category of a supporting artifact (see ArtifactReference.artifact_type).
+    INVOICE_PACKET = "invoice_packet"
+    PURCHASE_ORDER = "purchase_order"
+    RECEIPT_LOG = "receipt_log"
+    VENDOR_MASTER = "vendor_master"
+    POLICY_CARD = "policy_card"
+    APPROVAL_ARTIFACT = "approval_artifact"
+    INVOICE_HISTORY = "invoice_history"
+class ExceptionType(str, Enum):  # Category of a surfaced invoice exception (see ExceptionSummary.exception_type).
+    RECEIPT_QUANTITY_VARIANCE = "receipt_quantity_variance"
+    NON_PO_MISSING_APPROVAL = "non_po_missing_approval"
+    POSSIBLE_DUPLICATE = "possible_duplicate"
+    PRICE_VARIANCE = "price_variance"
+    CUMULATIVE_BILLING_VARIANCE = "cumulative_billing_variance"
+    TAX_VARIANCE = "tax_variance"
+    PAYMENT_TERMS_MISMATCH = "payment_terms_mismatch"
+class Severity(str, Enum):  # Severity level attached to an exception (see ExceptionSummary.severity).
+    LOW = "low"
+    MEDIUM = "medium"
+    HIGH = "high"
+    CRITICAL = "critical"
+class DuplicateMatchStrategy(str, Enum):  # Search strategy for run_duplicate_check actions.
+    EXACT_INVOICE_NUMBER = "exact_invoice_no"  # note: wire value abbreviates NUMBER as "no"
+    NORMALIZED_INVOICE_NUMBER = "normalized_invoice_no"  # note: wire value abbreviates NUMBER as "no"
+    VENDOR_AMOUNT_DATE = "vendor_amount_date"
+class NoteType(str, Enum):  # Workflow category of a case note (see CaseNote.note_type).
+    ISSUE_SUMMARY = "issue_summary"
+    ESCALATION_REQUEST = "escalation_request"
+    REVIEW_SUMMARY = "review_summary"
+class Disposition(str, Enum):  # Line-level outcome saved by set_line_resolution.
+    APPROVE = "approve"
+    HOLD = "hold"
+    REJECT = "reject"
+    ESCALATE = "escalate"  # InvoiceOpsAction requires route_to when a line uses this value
+class PaymentRecommendation(str, Enum):  # Header-level payment recommendation saved by set_header_resolution.
+    RELEASE_APPROVED_LINES = "release_approved_lines"
+    HOLD_FULL_INVOICE = "hold_full_invoice"
+    REJECT_FULL_INVOICE = "reject_full_invoice"
+    ESCALATE_CASE = "escalate_case"  # InvoiceOpsAction requires route_to with this value
+class DecisionBand(str, Enum):  # Band reported in SubmissionReport.decision_band; banding rules are not defined in this module.
+    BEST = "best"
+    SAFE_SUBOPTIMAL = "safe_suboptimal"
+    WRONG = "wrong"
+    UNSAFE = "unsafe"
+class RouteTarget(str, Enum):  # Next owner or follow-up queue allowed in route_to fields.
+    RECEIVING = "receiving"
+    REQUESTER = "requester"
+    PROCUREMENT = "procurement"
+    TAX = "tax"
+    AP_MANAGER = "ap_manager"
+class RiskFlag(str, Enum):  # Compact risk hints shown on the queue card (see QueueCard.risk_flags).
+    PO_INVOICE = "po_invoice"
+    RECEIPT_VARIANCE = "receipt_variance"
+    PARTIAL_RECEIPT = "partial_receipt"
+    PRICE_VARIANCE = "price_variance"
+    NON_PO_INVOICE = "non_po_invoice"
+    MISSING_APPROVAL = "missing_approval"
+    POSSIBLE_DUPLICATE = "possible_duplicate"
+    CUMULATIVE_BILLING_RISK = "cumulative_billing_risk"
+    TAX_VARIANCE = "tax_variance"
+    TERMS_MISMATCH = "terms_mismatch"
+class ReasonCode(str, Enum):  # Structured reason codes accepted in reason_codes on notes, resolutions and actions.
+    MATCHED_TO_PO_AND_RECEIPT = "matched_to_po_and_receipt"
+    RECEIPT_NOT_CONFIRMED = "receipt_not_confirmed"
+    PARTIAL_RECEIPT_PENDING = "partial_receipt_pending"
+    PRICE_EXCEEDS_PO_RATE = "price_exceeds_po_rate"
+    NON_PO_APPROVAL_MISSING = "non_po_approval_missing"
+    POSSIBLE_DUPLICATE_REVIEW = "possible_duplicate_review"
+    CUMULATIVE_BILLING_EXCEEDS_PO = "cumulative_billing_exceeds_po"
+    TAX_AMOUNT_MISMATCH = "tax_amount_mismatch"
+    PAYMENT_TERMS_MISMATCH = "payment_terms_mismatch"
+    SAFE_TO_PAY = "safe_to_pay"
+    ESCALATE_FOR_MANUAL_REVIEW = "escalate_for_manual_review"
+class ArtifactField(Model):  # One label/value pair; used by ArtifactView.fields and ExceptionDetail.fields.
+    label: str = Field(..., description="Field label shown to the reviewer")
+    value: str = Field(..., description="Rendered field value")
+class ArtifactLineItem(Model):  # One line item exposed by an artifact; numeric fields are optional.
+    line_id: str = Field(..., description="Stable line identifier")
+    description: str = Field(..., description="Line description")
+    quantity: float | None = Field(default=None, description="Line quantity")
+    unit_price: float | None = Field(default=None, description="Unit price")
+    amount: float | None = Field(
+        default=None,
+        description="Extended amount for the line when the artifact exposes it",
+    )
+    status: str = Field(default="", description="Operational status")
+    notes: str = Field(default="", description="Short line note")
+class ArtifactEvent(Model):  # One timeline or ledger event exposed by an artifact (see ArtifactView.events).
+    event_id: str = Field(..., description="Stable event identifier")
+    event_type: str = Field(..., description="Event type label")
+    event_date: str = Field(..., description="Event date in ISO format")  # stored as str; no format validation here
+    description: str = Field(..., description="Human readable event description")
+    quantity: float | None = Field(default=None, description="Event quantity")
+    amount: float | None = Field(default=None, description="Event amount")
+    status: str = Field(default="", description="Event status")
+class ArtifactReference(Model):  # Identity of an artifact without its contents; base class of ArtifactView.
+    artifact_id: str = Field(..., description="Artifact identifier")
+    artifact_type: ArtifactType = Field(..., description="Artifact type")
+    title: str = Field(..., description="Artifact title shown in the UI")
+class ArtifactView(ArtifactReference):  # An artifact reference plus its contents; every content list defaults to empty.
+    summary: str = Field(default="", description="Short artifact summary")
+    fields: list[ArtifactField] = Field(
+        default_factory=list,
+        description="Structured key-value pairs exposed by the artifact",
+    )
+    line_items: list[ArtifactLineItem] = Field(
+        default_factory=list,
+        description="Line items exposed by the artifact",
+    )
+    events: list[ArtifactEvent] = Field(
+        default_factory=list,
+        description="Timeline or ledger events exposed by the artifact",
+    )
+    related_refs: list[str] = Field(
+        default_factory=list,
+        description="Related artifact or issue identifiers",
+    )
+class QueueCard(Model):  # Queue-level summary of one invoice case (see InvoiceOpsObservation.queue_card).
+    case_id: str = Field(..., description="Stable case identifier")
+    vendor_name: str = Field(..., description="Vendor display name")
+    vendor_id: str = Field(..., description="Vendor identifier")
+    invoice_number: str = Field(..., description="Invoice number")
+    invoice_date: str = Field(..., description="Invoice date in ISO format")  # stored as str; no format validation here
+    invoice_total: float = Field(..., description="Gross invoice total")
+    currency: str = Field(..., description="Invoice currency")
+    po_number: str | None = Field(default=None, description="PO number when present")
+    risk_flags: list[RiskFlag] = Field(
+        default_factory=list,
+        description="Compact risk hints visible from the queue",
+    )
+    summary: str = Field(default="", description="Short queue summary")
+class ExceptionSummary(Model):  # Exception stub visible before inspection; base class of ExceptionDetail.
+    exception_id: str = Field(..., description="Stable exception identifier")
+    exception_type: ExceptionType = Field(..., description="Exception category")
+    severity: Severity = Field(..., description="Exception severity")
+    headline: str = Field(..., description="Queue-visible exception stub headline")
+    impacted_line_ids: list[str] = Field(
+        default_factory=list,
+        description="Invoice lines directly impacted by the exception",
+    )
+    short_description: str = Field(
+        default="",
+        description="Queue-safe hint shown before inspection",
+    )
+class ExceptionDetail(ExceptionSummary):  # The summary fields plus the facts and guidance shown after inspection.
+    fields: list[ArtifactField] = Field(
+        default_factory=list,
+        description="Structured exception facts shown after inspection",
+    )
+    reviewer_guidance: str = Field(
+        default="",
+        description="Short workflow guidance exposed after inspection",
+    )
+class DuplicateCandidate(Model):  # One ledger invoice surfaced by a duplicate search.
+    candidate_id: str = Field(..., description="Ledger invoice identifier")
+    vendor_name: str = Field(..., description="Vendor display name")
+    invoice_number: str = Field(..., description="Prior or pending invoice number")
+    invoice_date: str = Field(..., description="Candidate invoice date")
+    gross_amount: float = Field(..., description="Candidate gross amount")
+    status: str = Field(..., description="Current ledger or workflow status")
+    match_basis: str = Field(..., description="Why the invoice was matched")
+    overlap_summary: str = Field(..., description="Human readable overlap summary")
+    supported_match_strategies: list[DuplicateMatchStrategy] = Field(
+        default_factory=list,
+        description="Match strategies that surface this candidate",
+    )
+class CaseNote(Model):  # A saved case note; the stored counterpart of an add_note action.
+    note_id: str = Field(..., description="Stable note identifier")
+    note_type: NoteType = Field(..., description="Workflow note category")
+    reason_codes: list[ReasonCode] = Field(
+        default_factory=list,
+        description="Structured reason codes captured in the note",
+    )
+    evidence_refs: list[str] = Field(
+        default_factory=list,
+        description="Artifact or exception references cited in the note",
+    )
+    text: str = Field(
+        ...,
+        description="Free-form note text retained for auditability, not prose-quality scoring",
+    )
+    saved_at_step: int = Field(..., ge=0, description="Step where the note was saved")
+class LineResolution(Model):  # A saved line disposition; the stored counterpart of set_line_resolution.
+    resolution_id: str = Field(..., description="Stable resolution identifier")
+    line_id: str = Field(..., description="Invoice line identifier")
+    disposition: Disposition = Field(..., description="Line disposition")
+    reason_codes: list[ReasonCode] = Field(
+        default_factory=list,
+        description="Structured reason codes supporting the line disposition",
+    )
+    evidence_refs: list[str] = Field(
+        default_factory=list,
+        description="Artifact or exception references cited by the reviewer",
+    )
+    route_to: RouteTarget | None = Field(
+        default=None,
+        description="Next owner or follow-up queue for the line when another team must act",
+    )
+    saved_at_step: int = Field(
+        ...,
+        ge=0,
+        description="Step where the line disposition was saved",
+    )
+class HeaderResolution(Model):  # A saved header recommendation; the stored counterpart of set_header_resolution.
+    resolution_id: str = Field(..., description="Stable header resolution identifier")
+    payment_recommendation: PaymentRecommendation = Field(
+        ...,
+        description=(
+            "Header-level payment recommendation governing whether any payment can "
+            "be released now, including case-level blockers that may override "
+            "otherwise approved lines"
+        ),
+    )
+    reason_codes: list[ReasonCode] = Field(
+        default_factory=list,
+        description="Structured reason codes for the header recommendation",
+    )
+    evidence_refs: list[str] = Field(
+        default_factory=list,
+        description="Artifact or exception references cited by the reviewer",
+    )
+    route_to: RouteTarget | None = Field(
+        default=None,
+        description="Next owner or follow-up queue for the case when another team must act",
+    )
+    saved_at_step: int = Field(
+        ...,
+        ge=0,
+        description="Step where the header recommendation was saved",
+    )
+class LineScoreReport(Model):  # Score breakdown for one invoice line (see SubmissionReport.line_reports).
+    line_id: str
+    line_score: float  # NOTE(review): presumably combines the component scores below; confirm against the grader
+    disposition_score: float
+    reason_score: float
+    route_score: float
+    evidence_score: float
+    accepted_dispositions: list[Disposition] = Field(default_factory=list)  # presumably the dispositions the grader accepts; TODO confirm
+class HeaderScoreReport(Model):  # Score breakdown for the header resolution (see SubmissionReport.header_report).
+    header_score: float  # NOTE(review): presumably combines the component scores below; confirm against the grader
+    recommendation_score: float
+    reason_score: float
+    route_score: float
+    evidence_score: float
+    accepted_recommendations: list[PaymentRecommendation] = Field(default_factory=list)  # presumably the recommendations the grader accepts; TODO confirm
+class IssueNoteReport(Model):  # Note score breakdown for one issue (see SubmissionReport.note_reports).
+    issue_id: str
+    note_score: float
+    reason_score: float
+    evidence_score: float
+class SubmissionReport(Model):  # Grading report attached to the observation after submission.
+    decision_band: DecisionBand
+    total_score: float = Field(..., ge=0.0, le=1.0)  # this and every *_score field below is validated to lie in [0.0, 1.0]
+    core_decision_score: float = Field(..., ge=0.0, le=1.0)
+    reason_quality_score: float = Field(..., ge=0.0, le=1.0)
+    auxiliary_score: float = Field(..., ge=0.0, le=1.0)
+    resolution_score: float = Field(..., ge=0.0, le=1.0)
+    evidence_score: float = Field(..., ge=0.0, le=1.0)
+    documentation_score: float = Field(..., ge=0.0, le=1.0)
+    efficiency_score: float = Field(..., ge=0.0, le=1.0)
+    safety_cap_applied: float | None = Field(
+        default=None,
+        description="Cap value applied because the action set was unsafe",
+    )
+    unsafe_findings: list[str] = Field(
+        default_factory=list,
+        description="Unsafe findings surfaced by the grader",
+    )
+    line_reports: list[LineScoreReport] = Field(default_factory=list)
+    header_report: HeaderScoreReport | None = None
+    note_reports: list[IssueNoteReport] = Field(default_factory=list)
+class Progress(Model):  # Episode progress counters; every count is validated as non-negative.
+    steps_used: int = Field(..., ge=0, description="Steps used in the episode")
+    steps_remaining: int = Field(..., ge=0, description="Steps remaining")
+    opened_artifacts: int = Field(..., ge=0, description="Unique artifacts opened")
+    inspected_exceptions: int = Field(
+        ...,
+        ge=0,
+        description="Unique exceptions inspected",
+    )
+    notes_count: int = Field(..., ge=0, description="Saved notes")
+    line_resolutions: int = Field(..., ge=0, description="Saved line resolutions")
+    duplicate_checks_run: int = Field(
+        ...,
+        ge=0,
+        description="Duplicate check actions executed",
+    )
+    invalid_actions: int = Field(..., ge=0, description="Invalid actions taken")
+    redundant_actions: int = Field(..., ge=0, description="Redundant actions taken")
+    submitted: bool = Field(default=False, description="Whether the case is submitted")
+class InvoiceOpsObservation(Observation):  # Observation payload for InvoiceOps; every field has a default.
+    message: str = Field(default="", description="Short environment message")
+    task_id: TaskId | None = Field(default=None, description="Task bucket for the case")
+    scenario_id: str | None = Field(default=None, description="Scenario identifier")
+    title: str = Field(default="", description="Case title")
+    description: str = Field(default="", description="Case description")
+    queue_card: QueueCard | None = Field(
+        default=None,
+        description="Queue-level summary of the current invoice case",
+    )
+    available_artifacts: list[ArtifactReference] = Field(
+        default_factory=list,
+        description="Artifacts currently available to the reviewer",
+    )
+    opened_artifact: ArtifactView | None = Field(
+        default=None,
+        description="Most recently opened artifact",
+    )
+    visible_exceptions: list[ExceptionSummary] = Field(
+        default_factory=list,
+        description="Queue-visible exception stubs visible before detailed inspection",
+    )
+    inspected_exception: ExceptionDetail | None = Field(
+        default=None,
+        description="Most recently inspected full exception detail",
+    )
+    duplicate_candidates: list[DuplicateCandidate] = Field(
+        default_factory=list,
+        description="Candidates surfaced by duplicate search",
+    )
+    draft_notes: list[CaseNote] = Field(
+        default_factory=list,
+        description="Saved case notes",
+    )
+    draft_line_resolutions: list[LineResolution] = Field(
+        default_factory=list,
+        description="Draft line resolutions saved so far",
+    )
+    draft_header_resolution: HeaderResolution | None = Field(
+        default=None,
+        description="Draft header recommendation if saved",
+    )
+    submission_report: SubmissionReport | None = Field(
+        default=None,
+        description="Deterministic grading report after submission",
+    )
+    progress: Progress = Field(
+        default_factory=lambda: Progress(  # all-zero, not-submitted counters when no progress is supplied
+            steps_used=0,
+            steps_remaining=0,
+            opened_artifacts=0,
+            inspected_exceptions=0,
+            notes_count=0,
+            line_resolutions=0,
+            duplicate_checks_run=0,
+            invalid_actions=0,
+            redundant_actions=0,
+            submitted=False,
+        ),
+        description="Episode progress counters",
+    )
+    known_refs: list[str] = Field(
+        default_factory=list,
+        description="Evidence refs that can be cited safely in notes or resolutions",
+    )
+    episode_score: float | None = Field(
+        default=None,
+        description="Final episode score when the case is done",
+    )
+class InvoiceOpsState(State):  # Environment state for InvoiceOps: case identity plus episode counters, all defaulted.
+    task_id: TaskId | None = Field(default=None, description="Task bucket")
+    scenario_id: str | None = Field(default=None, description="Scenario identifier")
+    case_id: str | None = Field(default=None, description="Case identifier")
+    current_artifact_id: str | None = Field(
+        default=None,
+        description="Most recently opened artifact",
+    )
+    submitted: bool = Field(default=False, description="Whether the case is submitted")
+    step_limit: int = Field(default=0, ge=0, description="Episode step budget")
+    duplicate_checks_run: int = Field(
+        default=0,
+        ge=0,
+        description="Number of duplicate checks executed",
+    )
+    invalid_actions: int = Field(
+        default=0,
+        ge=0,
+        description="Number of invalid actions taken",
+    )
+    redundant_actions: int = Field(
+        default=0,
+        ge=0,
+        description="Number of redundant actions taken",
+    )
+class InvoiceOpsAction(Action):
+    action_type: ActionType = Field(..., description="Action to execute")
+    artifact_id: str | None = Field(default=None, description="Artifact to open")
+    exception_id: str | None = Field(default=None, description="Exception to inspect")
+    match_strategy: DuplicateMatchStrategy | None = Field(
+        default=None,
+        description="Duplicate search strategy to run",
+    )
+    note_type: NoteType | None = Field(default=None, description="Case note type")
+    reason_codes: list[ReasonCode] = Field(
+        default_factory=list,
+        description="Structured reason codes carried by the action",
+    )
+    evidence_refs: list[str] = Field(
+        default_factory=list,
+        description="Artifact or exception refs supporting the action",
+    )
+    text: str | None = Field(default=None, description="Free-form note text")
+    line_id: str | None = Field(default=None, description="Invoice line identifier")
+    disposition: Disposition | None = Field(default=None, description="Line outcome")
+    payment_recommendation: PaymentRecommendation | None = Field(
+        default=None,
+        description="Header-level payment recommendation",
+    )
+    route_to: RouteTarget | None = Field(
+        default=None,
+        description="Next owner or follow-up queue for the action, when applicable",
+    )
+    note_ids: list[str] = Field(
+        default_factory=list,
+        description="Optional note identifiers to submit",
+    )
+    line_resolution_ids: list[str] = Field(
+        default_factory=list,
+        description="Optional line resolution identifiers to submit",
+    )
+    header_resolution_id: str | None = Field(
+        default=None,
+        description="Optional header resolution identifier to submit",
+    )
+    @model_validator(mode="after")
+    def validate_action_fields(self) -> "InvoiceOpsAction":
+        action_type = self.action_type
+        if action_type is ActionType.OPEN_ARTIFACT:
+            if not self.artifact_id:
+                raise ValueError("artifact_id is required for open_artifact")
+            return self
+        if action_type is ActionType.INSPECT_EXCEPTION:
+            if not self.exception_id:
+                raise ValueError("exception_id is required for inspect_exception")
+            return self
+        if action_type is ActionType.RUN_DUPLICATE_CHECK:
+            if self.match_strategy is None:
+                raise ValueError("match_strategy is required for run_duplicate_check")
+            return self
+        if action_type is ActionType.ADD_NOTE:
+            if self.note_type is None:
+                raise ValueError("note_type is required for add_note")
+            if not self.reason_codes:
+                raise ValueError("reason_codes are required for add_note")
+            if not self.evidence_refs:
+                raise ValueError("evidence_refs are required for add_note")
+            if not self.text or not self.text.strip():
+                raise ValueError("text is required for add_note")
+            return self
+        if action_type is ActionType.SET_LINE_RESOLUTION:
+            if not self.line_id:
+                raise ValueError("line_id is required for set_line_resolution")
+            if self.disposition is None:
+                raise ValueError("disposition is required for set_line_resolution")
+            if not self.reason_codes:
+                raise ValueError("reason_codes are required for set_line_resolution")
+            if not self.evidence_refs:
+                raise ValueError("evidence_refs are required for set_line_resolution")
+            if self.disposition is Disposition.ESCALATE and self.route_to is None:
+                raise ValueError("route_to is required when escalating a line")
+            return self
+        if action_type is ActionType.SET_HEADER_RESOLUTION:
+            if self.payment_recommendation is None:
+                raise ValueError(
+                    "payment_recommendation is required for set_header_resolution"
+                )
+            if not self.reason_codes:
+                raise ValueError("reason_codes are required for set_header_resolution")
+            if not self.evidence_refs:
+                raise ValueError("evidence_refs are required for set_header_resolution")
+            if (
+                self.payment_recommendation is PaymentRecommendation.ESCALATE_CASE
+                and self.route_to is None
+            ):
+                raise ValueError("route_to is required when escalating the case")
+            return self
+        if action_type is ActionType.SUBMIT_CASE:
+            return self
+        raise ValueError(f"Unsupported action_type: {action_type}")

openenv.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+spec_version: 1
+name: invoiceops_env
+type: space
+runtime: fastapi
+app: server.app:app
+port: 8000

pyproject.toml ADDED Viewed

	@@ -0,0 +1,34 @@

+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "openenv-invoiceops_env"
+version = "0.1.0"
+description = "AP invoice exception handling environment for OpenEnv"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "openenv-core[core]>=0.2.2,<0.3",
+    "fastapi>=0.115.0",
+    "pydantic>=2.0.0",
+    "uvicorn[standard]>=0.24.0",
+    "openai>=2.7.2",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+]
+[project.scripts]
+server = "invoiceops_env.server.app:main"
+[tool.setuptools]
+include-package-data = true
+packages = ["invoiceops_env", "invoiceops_env.server"]
+package-dir = { "invoiceops_env" = ".", "invoiceops_env.server" = "server" }
+[tool.setuptools.package-data]
+invoiceops_env = ["data/**/*.json", "*.yaml", "*.md"]

server/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""InvoiceOps environment server components."""
+from invoiceops_env.server.invoiceops_env_environment import InvoiceOpsEnvironment
+__all__ = ["InvoiceOpsEnvironment"]

server/app.py ADDED Viewed

	@@ -0,0 +1,50 @@

+"""FastAPI entrypoint for InvoiceOps."""
+try:
+    from openenv.core.env_server.http_server import create_app
+except Exception as e:  # pragma: no cover
+    raise ImportError(
+        "openenv is required for the web interface. Install dependencies with '\n    uv sync\n'"
+    ) from e
+from invoiceops_env.models import InvoiceOpsAction, InvoiceOpsObservation
+from invoiceops_env.server.invoiceops_env_environment import InvoiceOpsEnvironment
+# ASGI application object handed to uvicorn.run() in main() below.
+# create_app receives the environment class together with its action and
+# observation models. NOTE(review): max_concurrent_envs presumably caps the
+# number of simultaneously live environment sessions -- confirm against the
+# openenv-core create_app documentation.
+app = create_app(
+    InvoiceOpsEnvironment,
+    InvoiceOpsAction,
+    InvoiceOpsObservation,
+    env_name="invoiceops_env",
+    max_concurrent_envs=4,
+)
+def _resolve_cli_args(
+    default_host: str = "0.0.0.0",
+    default_port: int = 8000,
+) -> tuple[str, int]:
+    import argparse
+    parser = argparse.ArgumentParser(add_help=False)
+    parser.add_argument("--host", default=default_host)
+    parser.add_argument("--port", type=int, default=default_port)
+    args, _ = parser.parse_known_args()
+    return args.host, args.port
+def main(host: str | None = None, port: int | None = None) -> None:
+    """Run the server directly via ``uv run --project . server``."""
+    import uvicorn
+    if host is None and port is None:
+        host, port = _resolve_cli_args()
+    else:
+        host = host or "0.0.0.0"
+        port = port or 8000
+    uvicorn.run(app, host=host, port=port)
+if __name__ == "__main__":
+    main()

server/fixtures.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""Fixture constants for InvoiceOps."""
+from __future__ import annotations
+from pathlib import Path
+from invoiceops_env.models import TaskId
+# parents[1] of this file's resolved path, i.e. the directory that contains
+# the directory this module lives in.
+PACKAGE_ROOT = Path(__file__).resolve().parents[1]
+DATA_DIR = PACKAGE_ROOT / "data"
+SCENARIOS_DIR = DATA_DIR / "scenarios"
+# Human-readable description of the environment.
+ENV_DESCRIPTION = (
+    "Document-centric AP invoice exception handling environment with deterministic "
+    "grading for non-PO routing, duplicate-evidence review, partial-release "
+    "judgment, chronology-aware exception handling, and safe payment-release "
+    "decisions."
+)
+# Scenario ids registered for each task; every task currently lists one.
+SCENARIOS_BY_TASK: dict[TaskId, tuple[str, ...]] = {
+    TaskId.EASY: ("easy",),
+    TaskId.MEDIUM: ("medium",),
+    TaskId.MEDIUM_PLUS: ("medium_plus",),
+    TaskId.HARD: ("hard",),
+}
+# The first scenario id listed for a task is that task's default.
+DEFAULT_SCENARIOS: dict[TaskId, str] = {
+    task: scenario_ids[0] for task, scenario_ids in SCENARIOS_BY_TASK.items()
+}
+# NOTE(review): presumably prepended to an id to form the evidence ref of a
+# duplicate check -- confirm against the use sites.
+DUPLICATE_CHECK_REF_PREFIX = "duplicate_check:"

server/grader.py ADDED Viewed

	@@ -0,0 +1,712 @@

+"""Deterministic grading for InvoiceOps cases."""
+from __future__ import annotations
+from dataclasses import dataclass
+from invoiceops_env.models import (
+    CaseNote,
+    DecisionBand,
+    Disposition,
+    HeaderResolution,
+    HeaderScoreReport,
+    IssueNoteReport,
+    LineResolution,
+    LineScoreReport,
+    PaymentRecommendation,
+    ReasonCode,
+    SubmissionReport,
+)
+from invoiceops_env.server.scenario_loader import (
+    HeaderExpectation,
+    NoteExpectation,
+    ResolutionExpectation,
+    ScenarioFixture,
+)
+# Component weights of a line's reported line_score (combined in grade_case).
+# Reason and route weights drop to zero when the expectation defines none.
+LINE_DISPOSITION_WEIGHT = 0.55
+LINE_REASON_WEIGHT = 0.15
+LINE_ROUTE_WEIGHT = 0.30
+# Component weights of the header's reported header_score (see _grade_header).
+HEADER_RECOMMENDATION_WEIGHT = 0.55
+HEADER_REASON_WEIGHT = 0.15
+HEADER_ROUTE_WEIGHT = 0.30
+# Weights blending reason and evidence quality into a single note score.
+NOTE_REASON_WEIGHT = 0.65
+NOTE_EVIDENCE_WEIGHT = 0.35
+# Weights of the four components that make up the auxiliary score.
+AUX_REASON_QUALITY_WEIGHT = 0.25
+AUX_EVIDENCE_WEIGHT = 0.45
+AUX_DOCUMENTATION_WEIGHT = 0.20
+AUX_EFFICIENCY_WEIGHT = 0.10
+# Weights blending the core decision score and the auxiliary score into the
+# progress value that positions total_score inside its decision band.
+BAND_CORE_WEIGHT = 0.60
+BAND_AUXILIARY_WEIGHT = 0.40
+# (floor, ceiling) of total_score for each decision band. Note that the WRONG
+# and UNSAFE ranges overlap between 0.05 and 0.15.
+BAND_RANGES: dict[DecisionBand, tuple[float, float]] = {
+    DecisionBand.BEST: (0.80, 1.00),
+    DecisionBand.SAFE_SUBOPTIMAL: (0.50, 0.79),
+    DecisionBand.WRONG: (0.05, 0.45),
+    DecisionBand.UNSAFE: (0.00, 0.15),
+}
+# Line disposition used when a header decision is mirrored onto the only line
+# of a single-line case (see _mirrored_single_line_resolution).
+HEADER_TO_LINE_DISPOSITION: dict[PaymentRecommendation, Disposition] = {
+    PaymentRecommendation.RELEASE_APPROVED_LINES: Disposition.APPROVE,
+    PaymentRecommendation.HOLD_FULL_INVOICE: Disposition.HOLD,
+    PaymentRecommendation.REJECT_FULL_INVOICE: Disposition.REJECT,
+    PaymentRecommendation.ESCALATE_CASE: Disposition.ESCALATE,
+}
+# Decisions that may still earn capped credit when only the safe gating refs
+# (rather than the full best gating refs) were observed in time.
+CONSERVATIVE_LINE_DISPOSITIONS = {
+    Disposition.HOLD,
+    Disposition.ESCALATE,
+    Disposition.REJECT,
+}
+CONSERVATIVE_HEADER_RECOMMENDATIONS = {
+    PaymentRecommendation.HOLD_FULL_INVOICE,
+    PaymentRecommendation.ESCALATE_CASE,
+    PaymentRecommendation.REJECT_FULL_INVOICE,
+}
+@dataclass(frozen=True)
+class ReviewTrace:
+    """Immutable summary of the review activity that grading consumes."""
+    # Step number recorded for each evidence ref. A ref counts towards a saved
+    # decision only when this step is strictly earlier than the decision's
+    # saved_at_step.
+    ref_steps: dict[str, int]
+    # Steps taken in the episode; compared with the scenario's
+    # efficient_step_target when computing the efficiency score.
+    steps_used: int
+    # Each invalid action removes 0.25 from the efficiency score.
+    invalid_actions: int = 0
+    # Each redundant action removes 0.08 from the efficiency score.
+    redundant_actions: int = 0
+def _f1(predicted: set[str], expected: set[str]) -> float:
+    if not predicted and not expected:
+        return 1.0
+    if not predicted or not expected:
+        return 0.0
+    true_positives = len(predicted & expected)
+    precision = true_positives / len(predicted)
+    recall = true_positives / len(expected)
+    if precision + recall == 0:
+        return 0.0
+    return (2 * precision * recall) / (precision + recall)
+def _reason_score(
+    reason_codes: list[ReasonCode] | None,
+    accepted_reason_sets: list[list[str]],
+) -> float:
+    if not accepted_reason_sets:
+        return 1.0
+    if not reason_codes:
+        return 0.0
+    predicted = {reason.value for reason in reason_codes}
+    return max(_f1(predicted, set(expected)) for expected in accepted_reason_sets)
+def _route_score(route_to_value: str | None, accepted_routes: list[str]) -> float:
+    if not accepted_routes:
+        return 1.0
+    if route_to_value is None:
+        return 0.0
+    return 1.0 if route_to_value in accepted_routes else 0.0
+def _normalized_weighted_score(components: list[tuple[float, float]]) -> float:
+    active_weight = sum(weight for _, weight in components if weight > 0.0)
+    if active_weight <= 0.0:
+        return 0.0
+    return sum(score * weight for score, weight in components if weight > 0.0) / active_weight
+def _timely_refs(
+    cited_refs: list[str] | None,
+    ref_steps: dict[str, int],
+    saved_at_step: int | None,
+) -> set[str]:
+    if saved_at_step is None or not cited_refs:
+        return set()
+    timely_refs: set[str] = set()
+    for ref in cited_refs:
+        ref_step = ref_steps.get(ref)
+        if ref_step is not None and ref_step < saved_at_step:
+            timely_refs.add(ref)
+    return timely_refs
+def _observed_refs_before_step(
+    ref_steps: dict[str, int],
+    saved_at_step: int | None,
+) -> set[str]:
+    if saved_at_step is None:
+        return set()
+    return {
+        ref
+        for ref, ref_step in ref_steps.items()
+        if ref_step < saved_at_step
+    }
+def _evidence_score(
+    cited_refs: list[str] | None,
+    decisive_refs: list[str],
+    ref_steps: dict[str, int],
+    saved_at_step: int | None,
+) -> float:
+    if not decisive_refs:
+        return 1.0
+    timely_cited_refs = _timely_refs(cited_refs, ref_steps, saved_at_step)
+    if not timely_cited_refs:
+        return 0.0
+    return _f1(timely_cited_refs, set(decisive_refs))
+def _gating_refs_satisfied(
+    gating_refs: list[str],
+    ref_steps: dict[str, int],
+    saved_at_step: int | None,
+) -> bool:
+    if not gating_refs:
+        return True
+    # Band gating depends on what the agent had already uncovered in time,
+    # not on whether every observed ref was restated in the saved action.
+    observed_refs = _observed_refs_before_step(ref_steps, saved_at_step)
+    return set(gating_refs).issubset(observed_refs)
+def _accepted_dispositions(score_map: dict[str, float]) -> list[Disposition]:
+    return [
+        disposition
+        for disposition in Disposition
+        if score_map.get(disposition.value, 0.0) > 0.0
+    ]
+def _accepted_recommendations(
+    score_map: dict[str, float],
+) -> list[PaymentRecommendation]:
+    return [
+        recommendation
+        for recommendation in PaymentRecommendation
+        if score_map.get(recommendation.value, 0.0) > 0.0
+    ]
+def _max_positive_score(score_map: dict[str, float]) -> float:
+    positive_scores = [score for score in score_map.values() if score > 0.0]
+    return max(positive_scores) if positive_scores else 0.0
+def _max_suboptimal_positive_score(score_map: dict[str, float]) -> float:
+    best_score = _max_positive_score(score_map)
+    suboptimal_positive_scores = [
+        score for score in score_map.values() if 0.0 < score < best_score
+    ]
+    return max(suboptimal_positive_scores) if suboptimal_positive_scores else 0.0
+def _core_line_score(disposition_score: float, route_score: float, has_route: bool) -> float:
+    return _normalized_weighted_score(
+        [
+            (disposition_score, 0.70),
+            (route_score, 0.30 if has_route else 0.0),
+        ]
+    )
+def _core_header_score(
+    recommendation_score: float,
+    route_score: float,
+    has_route: bool,
+) -> float:
+    return _normalized_weighted_score(
+        [
+            (recommendation_score, 0.70),
+            (route_score, 0.30 if has_route else 0.0),
+        ]
+    )
+def _grade_note_expectation(
+    expectation: NoteExpectation,
+    notes: list[CaseNote],
+    trace: ReviewTrace,
+) -> IssueNoteReport:
+    best_note_score = 0.0
+    best_reason = 0.0
+    best_evidence = 0.0
+    for note in notes:
+        reason_score = _reason_score(note.reason_codes, expectation.accepted_reason_sets)
+        evidence_score = _evidence_score(
+            note.evidence_refs,
+            expectation.decisive_refs,
+            trace.ref_steps,
+            note.saved_at_step,
+        )
+        note_score = (
+            (NOTE_REASON_WEIGHT * reason_score)
+            + (NOTE_EVIDENCE_WEIGHT * evidence_score)
+        )
+        if note_score > best_note_score:
+            best_note_score = note_score
+            best_reason = reason_score
+            best_evidence = evidence_score
+    return IssueNoteReport(
+        issue_id=expectation.issue_id,
+        note_score=round(best_note_score, 4),
+        reason_score=round(best_reason, 4),
+        evidence_score=round(best_evidence, 4),
+    )
+def _grade_header(
+    expectation: HeaderExpectation,
+    header_resolution: HeaderResolution | None,
+    ref_steps: dict[str, int],
+) -> HeaderScoreReport:
+    recommendation_value = (
+        header_resolution.payment_recommendation.value
+        if header_resolution is not None
+        else None
+    )
+    recommendation_score = (
+        expectation.score_map.get(recommendation_value or "", 0.0)
+        if header_resolution is not None
+        else 0.0
+    )
+    reason_score = _reason_score(
+        header_resolution.reason_codes if header_resolution is not None else None,
+        expectation.accepted_reason_sets,
+    )
+    route_score = _route_score(
+        (
+            header_resolution.route_to.value
+            if header_resolution is not None and header_resolution.route_to is not None
+            else None
+        ),
+        expectation.accepted_routes,
+    )
+    evidence_score = _evidence_score(
+        header_resolution.evidence_refs if header_resolution is not None else None,
+        expectation.decisive_refs,
+        ref_steps,
+        header_resolution.saved_at_step if header_resolution is not None else None,
+    )
+    header_score = _normalized_weighted_score(
+        [
+            (recommendation_score, HEADER_RECOMMENDATION_WEIGHT),
+            (
+                reason_score,
+                HEADER_REASON_WEIGHT if expectation.accepted_reason_sets else 0.0,
+            ),
+            (route_score, HEADER_ROUTE_WEIGHT if expectation.accepted_routes else 0.0),
+        ]
+    )
+    return HeaderScoreReport(
+        header_score=round(header_score, 4),
+        recommendation_score=round(recommendation_score, 4),
+        reason_score=round(reason_score, 4),
+        route_score=round(route_score, 4),
+        evidence_score=round(evidence_score, 4),
+        accepted_recommendations=_accepted_recommendations(expectation.score_map),
+    )
+def _mirrored_single_line_resolution(
+    scenario: ScenarioFixture,
+    line_resolutions: dict[str, LineResolution],
+    header_resolution: HeaderResolution | None,
+) -> dict[str, LineResolution]:
+    # Single-line warm-up cases should not crater solely because the agent saved
+    # the correct header decision but omitted the redundant line decision.
+    if header_resolution is None or len(scenario.hidden_truth.line_expectations) != 1:
+        return line_resolutions
+    line_id, expectation = next(iter(scenario.hidden_truth.line_expectations.items()))
+    if line_id in line_resolutions:
+        return line_resolutions
+    route_value = (
+        header_resolution.route_to.value
+        if header_resolution.route_to is not None
+        else None
+    )
+    if expectation.accepted_routes and route_value not in expectation.accepted_routes:
+        return line_resolutions
+    mirrored_resolution = LineResolution(
+        resolution_id=f"{header_resolution.resolution_id}-mirror-line",
+        line_id=line_id,
+        disposition=HEADER_TO_LINE_DISPOSITION[header_resolution.payment_recommendation],
+        reason_codes=list(header_resolution.reason_codes),
+        evidence_refs=list(header_resolution.evidence_refs),
+        route_to=header_resolution.route_to,
+        saved_at_step=header_resolution.saved_at_step,
+    )
+    return {**line_resolutions, line_id: mirrored_resolution}
+def _line_is_best(
+    expectation: ResolutionExpectation,
+    disposition_score: float,
+    route_score: float,
+    gating_ok: bool,
+) -> bool:
+    if not gating_ok:
+        return False
+    if disposition_score <= 0.0:
+        return False
+    if expectation.accepted_routes and route_score <= 0.0:
+        return False
+    return disposition_score >= _max_positive_score(expectation.score_map)
+def _line_is_safe(
+    expectation: ResolutionExpectation,
+    resolution: LineResolution | None,
+    disposition_score: float,
+    route_score: float,
+    best_gating_ok: bool,
+    safe_gating_ok: bool,
+) -> bool:
+    if resolution is None:
+        return False
+    if disposition_score <= 0.0:
+        return False
+    if expectation.accepted_routes and route_score <= 0.0:
+        return False
+    if best_gating_ok:
+        return True
+    return (
+        safe_gating_ok
+        and resolution.disposition in CONSERVATIVE_LINE_DISPOSITIONS
+    )
+def _header_is_best(
+    expectation: HeaderExpectation,
+    recommendation_score: float,
+    route_score: float,
+    gating_ok: bool,
+) -> bool:
+    if not gating_ok:
+        return False
+    if recommendation_score <= 0.0:
+        return False
+    if expectation.accepted_routes and route_score <= 0.0:
+        return False
+    return recommendation_score >= _max_positive_score(expectation.score_map)
+def _header_is_safe(
+    expectation: HeaderExpectation,
+    header_resolution: HeaderResolution | None,
+    recommendation_score: float,
+    route_score: float,
+    best_gating_ok: bool,
+    safe_gating_ok: bool,
+) -> bool:
+    if header_resolution is None:
+        return False
+    if recommendation_score <= 0.0:
+        return False
+    if expectation.accepted_routes and route_score <= 0.0:
+        return False
+    if best_gating_ok:
+        return True
+    return (
+        safe_gating_ok
+        and header_resolution.payment_recommendation
+        in CONSERVATIVE_HEADER_RECOMMENDATIONS
+    )
+def grade_case(
+    scenario: ScenarioFixture,
+    line_resolutions: dict[str, LineResolution],
+    header_resolution: HeaderResolution | None,
+    notes: dict[str, CaseNote],
+    trace: ReviewTrace,
+) -> SubmissionReport:
+    """Grade a submitted case and build its full report.
+
+    Grading runs in two stages. First a decision band is assigned from the
+    gated line and header decisions (any unsafe finding forces the unsafe
+    band). Then the total score is placed inside that band's range using a
+    blend of the core decision score and an auxiliary score built from reason
+    quality, evidence, documentation and efficiency.
+
+    Args:
+        scenario: Scenario fixture whose ``hidden_truth`` holds the line,
+            header and note expectations plus the efficient step target.
+        line_resolutions: Saved line resolutions keyed by line id.
+        header_resolution: Saved header resolution, or ``None`` if absent.
+        notes: Saved case notes; only the values are graded.
+        trace: Review activity (ref steps, step count, invalid and redundant
+            action counts).
+
+    Returns:
+        A ``SubmissionReport`` with the band, the total score and every
+        component score rounded to four decimals.
+    """
+    # For single-line cases a missing line decision may be filled in from the
+    # header decision before scoring.
+    line_resolutions = _mirrored_single_line_resolution(
+        scenario,
+        line_resolutions,
+        header_resolution,
+    )
+    line_reports: list[LineScoreReport] = []
+    weighted_line_resolution = 0.0
+    weighted_line_core = 0.0
+    weighted_line_reason = 0.0
+    weighted_line_evidence = 0.0
+    total_amount = sum(
+        expectation.amount
+        for expectation in scenario.hidden_truth.line_expectations.values()
+    )
+    # Guard the per-line weight division below against a zero total.
+    total_amount = total_amount or 1.0
+    unsafe_findings: list[str] = []
+    all_lines_best = True
+    all_lines_safe = True
+    for line_id, expectation in scenario.hidden_truth.line_expectations.items():
+        resolution = line_resolutions.get(line_id)
+        # An unresolved line looks up "" in the score map and so scores 0.0.
+        disposition_value = resolution.disposition.value if resolution is not None else ""
+        disposition_score = expectation.score_map.get(disposition_value, 0.0)
+        reason_score = _reason_score(
+            resolution.reason_codes if resolution is not None else None,
+            expectation.accepted_reason_sets,
+        )
+        route_score = _route_score(
+            (
+                resolution.route_to.value
+                if resolution is not None and resolution.route_to is not None
+                else None
+            ),
+            expectation.accepted_routes,
+        )
+        evidence_score = _evidence_score(
+            resolution.evidence_refs if resolution is not None else None,
+            expectation.decisive_refs,
+            trace.ref_steps,
+            resolution.saved_at_step if resolution is not None else None,
+        )
+        best_gating_ok = _gating_refs_satisfied(
+            expectation.gating_refs,
+            trace.ref_steps,
+            resolution.saved_at_step if resolution is not None else None,
+        )
+        # Safe gating falls back to the best gating refs when the scenario
+        # defines no separate safe gating refs.
+        safe_gating_ok = _gating_refs_satisfied(
+            expectation.safe_gating_refs or expectation.gating_refs,
+            trace.ref_steps,
+            resolution.saved_at_step if resolution is not None else None,
+        )
+        # Reported line score: not gated, includes the reason component.
+        line_score = _normalized_weighted_score(
+            [
+                (disposition_score, LINE_DISPOSITION_WEIGHT),
+                (
+                    reason_score,
+                    LINE_REASON_WEIGHT if expectation.accepted_reason_sets else 0.0,
+                ),
+                (route_score, LINE_ROUTE_WEIGHT if expectation.accepted_routes else 0.0),
+            ]
+        )
+        core_score = _core_line_score(
+            disposition_score,
+            route_score,
+            bool(expectation.accepted_routes),
+        )
+        effective_core_score = 0.0
+        # Best credit needs the full best gating refs. Conservative actions can
+        # still earn capped core credit when the scenario defines safe gating.
+        if best_gating_ok:
+            effective_core_score = core_score
+        elif (
+            resolution is not None
+            and safe_gating_ok
+            and disposition_score > 0.0
+            and resolution.disposition in CONSERVATIVE_LINE_DISPOSITIONS
+        ):
+            # Cap at the best score that is strictly below the top score, so a
+            # safely gated decision cannot earn full best-band core credit.
+            capped_disposition_score = min(
+                disposition_score,
+                _max_suboptimal_positive_score(expectation.score_map),
+            )
+            if capped_disposition_score > 0.0:
+                effective_core_score = _core_line_score(
+                    capped_disposition_score,
+                    route_score,
+                    bool(expectation.accepted_routes),
+                )
+        # Each line contributes in proportion to its share of the total amount.
+        weight = expectation.amount / total_amount
+        weighted_line_resolution += line_score * weight
+        weighted_line_core += effective_core_score * weight
+        weighted_line_reason += reason_score * weight
+        weighted_line_evidence += evidence_score * weight
+        line_reports.append(
+            LineScoreReport(
+                line_id=line_id,
+                line_score=round(line_score, 4),
+                disposition_score=round(disposition_score, 4),
+                reason_score=round(reason_score, 4),
+                route_score=round(route_score, 4),
+                evidence_score=round(evidence_score, 4),
+                accepted_dispositions=_accepted_dispositions(expectation.score_map),
+            )
+        )
+        all_lines_best = all_lines_best and _line_is_best(
+            expectation,
+            disposition_score,
+            route_score,
+            best_gating_ok,
+        )
+        all_lines_safe = all_lines_safe and _line_is_safe(
+            expectation,
+            resolution,
+            disposition_score,
+            route_score,
+            best_gating_ok,
+            safe_gating_ok,
+        )
+        # Approving a line flagged unsafe_approve is recorded as an unsafe
+        # finding, which later forces the unsafe band.
+        if (
+            expectation.unsafe_approve
+            and resolution is not None
+            and resolution.disposition is Disposition.APPROVE
+        ):
+            unsafe_findings.append(f"unsafe approval on line {line_id}")
+    header_report = _grade_header(
+        scenario.hidden_truth.header_expectation,
+        header_resolution,
+        trace.ref_steps,
+    )
+    header_best_gating_ok = _gating_refs_satisfied(
+        scenario.hidden_truth.header_expectation.gating_refs,
+        trace.ref_steps,
+        header_resolution.saved_at_step if header_resolution is not None else None,
+    )
+    header_safe_gating_ok = _gating_refs_satisfied(
+        scenario.hidden_truth.header_expectation.safe_gating_refs
+        or scenario.hidden_truth.header_expectation.gating_refs,
+        trace.ref_steps,
+        header_resolution.saved_at_step if header_resolution is not None else None,
+    )
+    header_core_score = _core_header_score(
+        header_report.recommendation_score,
+        header_report.route_score,
+        bool(scenario.hidden_truth.header_expectation.accepted_routes),
+    )
+    # Same gating rule as for lines: full core credit with best gating, capped
+    # credit for a conservative recommendation under safe gating, else zero.
+    if header_best_gating_ok:
+        pass
+    elif (
+        header_resolution is not None
+        and header_safe_gating_ok
+        and header_report.recommendation_score > 0.0
+        and header_resolution.payment_recommendation
+        in CONSERVATIVE_HEADER_RECOMMENDATIONS
+    ):
+        capped_recommendation_score = min(
+            header_report.recommendation_score,
+            _max_suboptimal_positive_score(
+                scenario.hidden_truth.header_expectation.score_map
+            ),
+        )
+        if capped_recommendation_score > 0.0:
+            header_core_score = _core_header_score(
+                capped_recommendation_score,
+                header_report.route_score,
+                bool(scenario.hidden_truth.header_expectation.accepted_routes),
+            )
+        else:
+            header_core_score = 0.0
+    else:
+        header_core_score = 0.0
+    # Every case-level component blends lines and header 80/20.
+    reason_quality_score = (0.80 * weighted_line_reason) + (
+        0.20 * header_report.reason_score
+    )
+    resolution_score = (0.80 * weighted_line_resolution) + (
+        0.20 * header_report.header_score
+    )
+    core_decision_score = (0.80 * weighted_line_core) + (0.20 * header_core_score)
+    evidence_score = (0.80 * weighted_line_evidence) + (
+        0.20 * header_report.evidence_score
+    )
+    note_reports = [
+        _grade_note_expectation(expectation, list(notes.values()), trace)
+        for expectation in scenario.hidden_truth.note_expectations
+    ]
+    # Documentation is the mean note score, or full credit when the scenario
+    # expects no notes.
+    if note_reports:
+        documentation_score = sum(
+            report.note_score for report in note_reports
+        ) / len(note_reports)
+    else:
+        documentation_score = 1.0
+    extra_steps = max(
+        0,
+        trace.steps_used - scenario.hidden_truth.efficient_step_target,
+    )
+    # Efficiency starts at 1.0, loses a fixed amount per extra step, invalid
+    # action and redundant action, and is floored at 0.0.
+    efficiency_score = max(
+        0.0,
+        1.0
+        - (0.08 * extra_steps)
+        - (0.25 * trace.invalid_actions)
+        - (0.08 * trace.redundant_actions),
+    )
+    if header_resolution is not None:
+        header_value = header_resolution.payment_recommendation.value
+        if header_value in scenario.hidden_truth.header_expectation.unsafe_recommendations:
+            unsafe_findings.append(f"unsafe header recommendation {header_value}")
+    # Stage 1 assigns the decision band from the gated essential decisions.
+    # Stage 2 later scores within that band using evidence, notes, and efficiency.
+    if unsafe_findings:
+        decision_band = DecisionBand.UNSAFE
+    elif all_lines_best and _header_is_best(
+        scenario.hidden_truth.header_expectation,
+        header_report.recommendation_score,
+        header_report.route_score,
+        header_best_gating_ok,
+    ):
+        decision_band = DecisionBand.BEST
+    elif all_lines_safe and _header_is_safe(
+        scenario.hidden_truth.header_expectation,
+        header_resolution,
+        header_report.recommendation_score,
+        header_report.route_score,
+        header_best_gating_ok,
+        header_safe_gating_ok,
+    ):
+        decision_band = DecisionBand.SAFE_SUBOPTIMAL
+    else:
+        decision_band = DecisionBand.WRONG
+    auxiliary_score = _normalized_weighted_score(
+        [
+            (reason_quality_score, AUX_REASON_QUALITY_WEIGHT),
+            (evidence_score, AUX_EVIDENCE_WEIGHT),
+            (documentation_score, AUX_DOCUMENTATION_WEIGHT),
+            (efficiency_score, AUX_EFFICIENCY_WEIGHT),
+        ]
+    )
+    band_progress = _normalized_weighted_score(
+        [
+            (core_decision_score, BAND_CORE_WEIGHT),
+            (auxiliary_score, BAND_AUXILIARY_WEIGHT),
+        ]
+    )
+    # Interpolate between the band's floor and ceiling, then clamp to [0, 1].
+    band_floor, band_ceiling = BAND_RANGES[decision_band]
+    total_score = band_floor + ((band_ceiling - band_floor) * band_progress)
+    total_score = max(0.0, min(1.0, total_score))
+    return SubmissionReport(
+        decision_band=decision_band,
+        total_score=round(total_score, 4),
+        core_decision_score=round(core_decision_score, 4),
+        reason_quality_score=round(reason_quality_score, 4),
+        auxiliary_score=round(auxiliary_score, 4),
+        resolution_score=round(resolution_score, 4),
+        evidence_score=round(evidence_score, 4),
+        documentation_score=round(documentation_score, 4),
+        efficiency_score=round(efficiency_score, 4),
+        # Reports the unsafe band's ceiling only when that band was assigned.
+        safety_cap_applied=(
+            round(BAND_RANGES[DecisionBand.UNSAFE][1], 4)
+            if decision_band is DecisionBand.UNSAFE
+            else None
+        ),
+        unsafe_findings=unsafe_findings,
+        line_reports=line_reports,
+        header_report=header_report,
+        note_reports=note_reports,
+    )

server/invoiceops_env_environment.py ADDED Viewed

	@@ -0,0 +1,491 @@

+"""InvoiceOps environment implementation."""
+from __future__ import annotations
+from uuid import uuid4
+from openenv.core.env_server.interfaces import Environment
+from openenv.core.env_server.types import EnvironmentMetadata
+from invoiceops_env.models import (
+    ActionType,
+    CaseNote,
+    Disposition,
+    DuplicateCandidate,
+    HeaderResolution,
+    InvoiceOpsAction,
+    InvoiceOpsObservation,
+    InvoiceOpsState,
+    LineResolution,
+    PaymentRecommendation,
+    Progress,
+)
+from invoiceops_env.server.fixtures import ENV_DESCRIPTION
+from invoiceops_env.server.fixtures import DUPLICATE_CHECK_REF_PREFIX
+from invoiceops_env.server.grader import ReviewTrace, grade_case
+from invoiceops_env.server.reward_engine import DEFAULT_REWARD_CONFIG
+from invoiceops_env.server.scenario_loader import (
+    ScenarioFixture,
+    artifact_lookup,
+    artifact_references,
+    exception_lookup,
+    exception_summaries,
+    line_ids_for_scenario,
+    load_scenario,
+)
+class InvoiceOpsEnvironment(
+    Environment[InvoiceOpsAction, InvoiceOpsObservation, InvoiceOpsState]
+):
+    """Accounts-payable invoice exception handling environment."""
+    SUPPORTS_CONCURRENT_SESSIONS: bool = True
+    def __init__(self) -> None:
+        """Create an environment with no scenario loaded and empty episode state."""
+        super().__init__()
+        self._reward_config = DEFAULT_REWARD_CONFIG
+        # Set by reset(); step() raises RuntimeError while this is still None.
+        self._scenario: ScenarioFixture | None = None
+        # Lookup tables rebuilt from the scenario on every reset().
+        self._artifact_map = {}
+        self._exception_map = {}
+        self._state = InvoiceOpsState(episode_id=str(uuid4()), step_count=0)
+        # Per-episode bookkeeping; all of it is cleared again by reset().
+        self._opened_artifact_ids: set[str] = set()
+        self._inspected_exception_ids: set[str] = set()
+        self._duplicate_checks_run: set[str] = set()
+        self._duplicate_candidates: list[DuplicateCandidate] = []
+        self._notes: dict[str, CaseNote] = {}
+        self._line_resolutions: dict[str, LineResolution] = {}
+        self._header_resolution: HeaderResolution | None = None
+        # Step number recorded for each ref, e.g. when an artifact is first opened.
+        self._ref_steps: dict[str, int] = {}
+        self._invalid_actions = 0
+        self._redundant_actions = 0
+        self._submitted = False
+        self._submission_report = None
+        self._current_artifact_id: str | None = None
+        self._current_exception_id: str | None = None
+    def get_metadata(self) -> EnvironmentMetadata:
+        return EnvironmentMetadata(
+            name="InvoiceOpsEnvironment",
+            description=ENV_DESCRIPTION,
+            version="0.1.0",
+        )
+    def reset(
+        self,
+        seed: int | None = None,
+        episode_id: str | None = None,
+        task_id: str | None = None,
+        scenario_id: str | None = None,
+        **kwargs: object,
+    ) -> InvoiceOpsObservation:
+        del seed, kwargs
+        self._scenario = load_scenario(task_id=task_id, scenario_id=scenario_id)
+        self._artifact_map = artifact_lookup(self._scenario)
+        self._exception_map = exception_lookup(self._scenario)
+        self._state = InvoiceOpsState(
+            episode_id=episode_id or str(uuid4()),
+            step_count=0,
+            task_id=self._scenario.task_id,
+            scenario_id=self._scenario.scenario_id,
+            case_id=self._scenario.case_id,
+            current_artifact_id=None,
+            submitted=False,
+            step_limit=self._scenario.step_limit,
+            duplicate_checks_run=0,
+            invalid_actions=0,
+            redundant_actions=0,
+        )
+        self._opened_artifact_ids = set()
+        self._inspected_exception_ids = set()
+        self._duplicate_checks_run = set()
+        self._duplicate_candidates = []
+        self._notes = {}
+        self._line_resolutions = {}
+        self._header_resolution = None
+        self._ref_steps = {}
+        self._invalid_actions = 0
+        self._redundant_actions = 0
+        self._submitted = False
+        self._submission_report = None
+        self._current_artifact_id = None
+        self._current_exception_id = None
+        return self._build_observation(
+            message=f"{self._scenario.title} ready.",
+            reward=0.0,
+            done=False,
+        )
+    def step(
+        self,
+        action: InvoiceOpsAction,
+        timeout_s: float | None = None,
+        **kwargs: object,
+    ) -> InvoiceOpsObservation:
+        del timeout_s, kwargs
+        if self._scenario is None:
+            raise RuntimeError("reset() must be called before step()")
+        if self._submitted:
+            return self._invalid_observation(
+                "Case already submitted.",
+                self._reward_config.invalid_action_penalty,
+                done=True,
+            )
+        self._state.step_count += 1
+        reward = self._reward_config.step_cost
+        done = False
+        message = "Action processed."
+        match action.action_type:
+            case ActionType.OPEN_ARTIFACT:
+                artifact_id = action.artifact_id or ""
+                artifact = self._artifact_map.get(artifact_id)
+                if artifact is None:
+                    return self._invalid_observation(
+                        f"Unknown artifact_id: {action.artifact_id}",
+                        reward + self._reward_config.invalid_action_penalty,
+                    )
+                self._current_artifact_id = artifact_id
+                self._state.current_artifact_id = artifact_id
+                if artifact_id in self._opened_artifact_ids:
+                    self._redundant_actions += 1
+                    self._state.redundant_actions = self._redundant_actions
+                    reward += self._reward_config.redundant_open_penalty
+                    message = f"Artifact {artifact_id} was already opened."
+                else:
+                    self._opened_artifact_ids.add(artifact_id)
+                    self._ref_steps[artifact_id] = self._state.step_count
+                    reward += self._reward_config.first_open_artifact
+                    message = f"Opened artifact {artifact_id}."
+            case ActionType.INSPECT_EXCEPTION:
+                exception_id = action.exception_id or ""
+                exception = self._exception_map.get(exception_id)
+                if exception is None:
+                    return self._invalid_observation(
+                        f"Unknown exception_id: {action.exception_id}",
+                        reward + self._reward_config.invalid_action_penalty,
+                    )
+                self._current_exception_id = exception_id
+                if exception_id not in self._inspected_exception_ids:
+                    self._inspected_exception_ids.add(exception_id)
+                    self._ref_steps[exception_id] = self._state.step_count
+                    reward += self._reward_config.inspect_exception
+                message = f"Inspected exception {exception_id}."
+            case ActionType.RUN_DUPLICATE_CHECK:
+                strategy = action.match_strategy
+                assert strategy is not None
+                if strategy.value in self._duplicate_checks_run:
+                    self._redundant_actions += 1
+                    self._state.redundant_actions = self._redundant_actions
+                    reward += self._reward_config.redundant_duplicate_penalty
+                    message = f"Duplicate check {strategy.value} was already run."
+                else:
+                    self._duplicate_checks_run.add(strategy.value)
+                    self._state.duplicate_checks_run = len(self._duplicate_checks_run)
+                    self._ref_steps[
+                        f"{DUPLICATE_CHECK_REF_PREFIX}{strategy.value}"
+                    ] = self._state.step_count
+                    reward += self._reward_config.run_duplicate_check
+                    self._duplicate_candidates = [
+                        candidate
+                        for candidate in self._scenario.duplicate_candidates
+                        if strategy in candidate.supported_match_strategies
+                    ]
+                    for candidate in self._duplicate_candidates:
+                        self._ref_steps.setdefault(
+                            candidate.candidate_id,
+                            self._state.step_count,
+                        )
+                    message = (
+                        f"Duplicate search completed with {len(self._duplicate_candidates)} "
+                        f"candidate(s)."
+                    )
+            case ActionType.ADD_NOTE:
+                invalid_ref = self._first_invalid_ref(action.evidence_refs)
+                if invalid_ref is not None:
+                    return self._invalid_observation(
+                        f"Unknown evidence ref: {invalid_ref}",
+                        reward + self._reward_config.invalid_action_penalty,
+                    )
+                note_id = f"N-{len(self._notes) + 1:02d}"
+                self._notes[note_id] = CaseNote(
+                    note_id=note_id,
+                    note_type=action.note_type,
+                    reason_codes=action.reason_codes,
+                    evidence_refs=action.evidence_refs,
+                    text=(action.text or "").strip(),
+                    saved_at_step=self._state.step_count,
+                )
+                reward += self._reward_config.valid_note
+                message = f"Saved note {note_id}."
+            case ActionType.SET_LINE_RESOLUTION:
+                line_id = action.line_id or ""
+                if line_id not in line_ids_for_scenario(self._scenario):
+                    return self._invalid_observation(
+                        f"Unknown line_id: {action.line_id}",
+                        reward + self._reward_config.invalid_action_penalty,
+                    )
+                invalid_ref = self._first_invalid_ref(action.evidence_refs)
+                if invalid_ref is not None:
+                    return self._invalid_observation(
+                        f"Unknown evidence ref: {invalid_ref}",
+                        reward + self._reward_config.invalid_action_penalty,
+                    )
+                resolution_id = f"LR-{line_id}"
+                is_revision = line_id in self._line_resolutions
+                self._line_resolutions[line_id] = LineResolution(
+                    resolution_id=resolution_id,
+                    line_id=line_id,
+                    disposition=action.disposition,
+                    reason_codes=action.reason_codes,
+                    evidence_refs=action.evidence_refs,
+                    route_to=action.route_to,
+                    saved_at_step=self._state.step_count,
+                )
+                reward += (
+                    self._reward_config.revision_penalty
+                    if is_revision
+                    else self._reward_config.valid_line_resolution
+                )
+                if is_revision:
+                    self._redundant_actions += 1
+                    self._state.redundant_actions = self._redundant_actions
+                message = f"Saved line resolution for {line_id}."
+            case ActionType.SET_HEADER_RESOLUTION:
+                invalid_ref = self._first_invalid_ref(action.evidence_refs)
+                if invalid_ref is not None:
+                    return self._invalid_observation(
+                        f"Unknown evidence ref: {invalid_ref}",
+                        reward + self._reward_config.invalid_action_penalty,
+                    )
+                is_revision = self._header_resolution is not None
+                self._header_resolution = HeaderResolution(
+                    resolution_id="HR-001",
+                    payment_recommendation=action.payment_recommendation,
+                    reason_codes=action.reason_codes,
+                    evidence_refs=action.evidence_refs,
+                    route_to=action.route_to,
+                    saved_at_step=self._state.step_count,
+                )
+                reward += (
+                    self._reward_config.revision_penalty
+                    if is_revision
+                    else self._reward_config.valid_header_resolution
+                )
+                if is_revision:
+                    self._redundant_actions += 1
+                    self._state.redundant_actions = self._redundant_actions
+                message = "Saved header recommendation."
+            case ActionType.SUBMIT_CASE:
+                invalid_submission = self._validate_submission_refs(
+                    action.note_ids,
+                    action.line_resolution_ids,
+                    action.header_resolution_id,
+                )
+                if invalid_submission is not None:
+                    return self._invalid_observation(
+                        invalid_submission,
+                        reward + self._reward_config.invalid_action_penalty,
+                    )
+                consistency_error = self._validate_submission_consistency()
+                if consistency_error is not None:
+                    return self._invalid_observation(
+                        consistency_error,
+                        reward + self._reward_config.invalid_action_penalty,
+                    )
+                self._submission_report = grade_case(
+                    self._scenario,
+                    self._line_resolutions,
+                    self._header_resolution,
+                    self._notes,
+                    ReviewTrace(
+                        ref_steps=self._ref_steps,
+                        steps_used=self._state.step_count,
+                        invalid_actions=self._invalid_actions,
+                        redundant_actions=self._redundant_actions,
+                    ),
+                )
+                self._submitted = True
+                self._state.submitted = True
+                reward = self._submission_report.total_score
+                done = True
+                message = (
+                    f"Case submitted with score {self._submission_report.total_score:.4f}."
+                )
+        if not done and self._state.step_count >= self._state.step_limit:
+            self._submission_report = grade_case(
+                self._scenario,
+                self._line_resolutions,
+                self._header_resolution,
+                self._notes,
+                ReviewTrace(
+                    ref_steps=self._ref_steps,
+                    steps_used=self._state.step_count,
+                    invalid_actions=self._invalid_actions,
+                    redundant_actions=self._redundant_actions,
+                ),
+            )
+            self._submitted = True
+            self._state.submitted = True
+            reward = self._submission_report.total_score
+            done = True
+            message = (
+                "Step budget exhausted. "
+                f"Auto-submitted with score {self._submission_report.total_score:.4f}."
+            )
+        return self._build_observation(message=message, reward=reward, done=done)
+    def _validate_submission_refs(
+        self,
+        note_ids: list[str],
+        line_resolution_ids: list[str],
+        header_resolution_id: str | None,
+    ) -> str | None:
+        if note_ids:
+            missing = [note_id for note_id in note_ids if note_id not in self._notes]
+            if missing:
+                return f"Unknown note_ids in submit_case: {missing}"
+        if line_resolution_ids:
+            known = {resolution.resolution_id for resolution in self._line_resolutions.values()}
+            missing = [resolution_id for resolution_id in line_resolution_ids if resolution_id not in known]
+            if missing:
+                return f"Unknown line_resolution_ids in submit_case: {missing}"
+        if header_resolution_id is not None:
+            if self._header_resolution is None or self._header_resolution.resolution_id != header_resolution_id:
+                return f"Unknown header_resolution_id in submit_case: {header_resolution_id}"
+        return None
+    def _validate_submission_consistency(self) -> str | None:
+        approved_line_ids = sorted(
+            line_id
+            for line_id, resolution in self._line_resolutions.items()
+            if resolution.disposition is Disposition.APPROVE
+        )
+        escalated_without_route = sorted(
+            line_id
+            for line_id, resolution in self._line_resolutions.items()
+            if resolution.disposition is Disposition.ESCALATE and resolution.route_to is None
+        )
+        if escalated_without_route:
+            return f"Escalated lines require route_to: {escalated_without_route}"
+        header_resolution = self._header_resolution
+        if header_resolution is None:
+            return None
+        recommendation = header_resolution.payment_recommendation
+        if (
+            recommendation is PaymentRecommendation.ESCALATE_CASE
+            and header_resolution.route_to is None
+        ):
+            return "escalate_case requires route_to."
+        if (
+            recommendation is PaymentRecommendation.RELEASE_APPROVED_LINES
+            and not approved_line_ids
+        ):
+            return "release_approved_lines requires at least one approved line."
+        if (
+            recommendation is PaymentRecommendation.REJECT_FULL_INVOICE
+            and approved_line_ids
+        ):
+            return (
+                f"{recommendation.value} is inconsistent with approved lines: "
+                f"{approved_line_ids}"
+            )
+        return None
+    def _first_invalid_ref(self, evidence_refs: list[str]) -> str | None:
+        known_refs = set(self._ref_steps)
+        for ref in evidence_refs:
+            if ref not in known_refs:
+                return ref
+        return None
+    def _invalid_observation(
+        self,
+        message: str,
+        reward: float,
+        done: bool = False,
+    ) -> InvoiceOpsObservation:
+        self._invalid_actions += 1
+        self._state.invalid_actions = self._invalid_actions
+        return self._build_observation(message=message, reward=reward, done=done)
+    def _build_observation(
+        self,
+        *,
+        message: str,
+        reward: float,
+        done: bool,
+    ) -> InvoiceOpsObservation:
+        scenario = self._scenario
+        if scenario is None:
+            raise RuntimeError("Scenario is not loaded")
+        steps_remaining = max(0, self._state.step_limit - self._state.step_count)
+        progress = Progress(
+            steps_used=self._state.step_count,
+            steps_remaining=steps_remaining,
+            opened_artifacts=len(self._opened_artifact_ids),
+            inspected_exceptions=len(self._inspected_exception_ids),
+            notes_count=len(self._notes),
+            line_resolutions=len(self._line_resolutions),
+            duplicate_checks_run=len(self._duplicate_checks_run),
+            invalid_actions=self._invalid_actions,
+            redundant_actions=self._redundant_actions,
+            submitted=self._submitted,
+        )
+        opened_artifact = (
+            self._artifact_map[self._current_artifact_id]
+            if self._current_artifact_id is not None
+            else None
+        )
+        inspected_exception = (
+            self._exception_map[self._current_exception_id]
+            if self._current_exception_id is not None
+            else None
+        )
+        return InvoiceOpsObservation(
+            message=message,
+            task_id=scenario.task_id,
+            scenario_id=scenario.scenario_id,
+            title=scenario.title,
+            description=scenario.description,
+            queue_card=scenario.queue_card,
+            available_artifacts=artifact_references(scenario),
+            opened_artifact=opened_artifact,
+            visible_exceptions=exception_summaries(scenario),
+            inspected_exception=inspected_exception,
+            duplicate_candidates=self._duplicate_candidates,
+            draft_notes=list(self._notes.values()),
+            draft_line_resolutions=list(self._line_resolutions.values()),
+            draft_header_resolution=self._header_resolution,
+            submission_report=self._submission_report,
+            progress=progress,
+            known_refs=sorted(self._ref_steps),
+            episode_score=(
+                self._submission_report.total_score if self._submission_report else None
+            ),
+            done=done,
+            reward=reward,
+            metadata={
+                "case_id": scenario.case_id,
+                "task_id": scenario.task_id.value,
+            },
+        )
+    @property
+    def state(self) -> InvoiceOpsState:
+        """Return the environment's current InvoiceOpsState object (the live instance, not a copy)."""
+        return self._state

server/reward_engine.py ADDED Viewed

	@@ -0,0 +1,23 @@

+"""Dense reward shaping for InvoiceOps."""
+from __future__ import annotations
+from dataclasses import dataclass
+# Immutable table of per-step shaping rewards and penalties read by the
+# environment's step() method. Positive values are bonuses, negative values
+# are penalties; each is added to the running reward for the step.
+@dataclass(frozen=True)
+class RewardConfig:
+    # Base reward every counted step starts from.
+    step_cost: float = -0.01
+    # Added the first time a given artifact is opened.
+    first_open_artifact: float = 0.02
+    # Added the first time a given exception is inspected.
+    inspect_exception: float = 0.03
+    # Added the first time a given duplicate-check strategy is run.
+    run_duplicate_check: float = 0.03
+    # Added when a note is accepted and saved.
+    valid_note: float = 0.03
+    # Added when a line receives its first resolution.
+    valid_line_resolution: float = 0.04
+    # Added when the header resolution is saved for the first time.
+    valid_header_resolution: float = 0.05
+    # Added whenever an action is rejected as invalid.
+    invalid_action_penalty: float = -0.05
+    # Added when an already-opened artifact is opened again.
+    redundant_open_penalty: float = -0.03
+    # Added when an existing line or header resolution is overwritten.
+    revision_penalty: float = -0.02
+    # Added when a duplicate-check strategy is run a second time.
+    redundant_duplicate_penalty: float = -0.03
+# Shared instance carrying the default values above.
+DEFAULT_REWARD_CONFIG = RewardConfig()

server/scenario_loader.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""Scenario loading helpers for InvoiceOps."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from pydantic import Field
+from invoiceops_env.models import (
+    ArtifactReference,
+    ArtifactView,
+    DuplicateCandidate,
+    ExceptionDetail,
+    ExceptionSummary,
+    ExceptionType,
+    QueueCard,
+    RouteTarget,
+    TaskId,
+)
+from invoiceops_env.models import Model as BaseModel
+from invoiceops_env.server.fixtures import (
+    DEFAULT_SCENARIOS,
+    SCENARIOS_DIR,
+)
+# Expected outcome for one invoice line; instances are the values of
+# HiddenTruth.line_expectations, keyed by line id.
+# NOTE(review): the score/reason/route/ref fields are presumably consumed by the
+# grader, which is not part of this module - confirm their semantics there.
+class ResolutionExpectation(BaseModel):
+    # Required and validated to be >= 0.
+    amount: float = Field(..., ge=0.0)
+    score_map: dict[str, float] = Field(default_factory=dict)
+    accepted_reason_sets: list[list[str]] = Field(default_factory=list)
+    accepted_routes: list[str] = Field(default_factory=list)
+    gating_refs: list[str] = Field(default_factory=list)
+    safe_gating_refs: list[str] = Field(default_factory=list)
+    decisive_refs: list[str] = Field(default_factory=list)
+    unsafe_approve: bool = Field(default=False)
+# Expected outcome for the invoice-level (header) recommendation; stored as
+# HiddenTruth.header_expectation. Every field defaults to an empty container.
+# NOTE(review): field semantics are presumably defined by the grader, which is
+# not part of this module - confirm there.
+class HeaderExpectation(BaseModel):
+    score_map: dict[str, float] = Field(default_factory=dict)
+    accepted_reason_sets: list[list[str]] = Field(default_factory=list)
+    accepted_routes: list[str] = Field(default_factory=list)
+    gating_refs: list[str] = Field(default_factory=list)
+    safe_gating_refs: list[str] = Field(default_factory=list)
+    decisive_refs: list[str] = Field(default_factory=list)
+    unsafe_recommendations: list[str] = Field(default_factory=list)
+    overconservative_recommendations: list[str] = Field(default_factory=list)
+# Expected content for one documented issue; instances are listed in
+# HiddenTruth.note_expectations. issue_id is the only required field.
+class NoteExpectation(BaseModel):
+    issue_id: str
+    accepted_reason_sets: list[list[str]] = Field(default_factory=list)
+    decisive_refs: list[str] = Field(default_factory=list)
+# Expectation data attached to a scenario fixture (ScenarioFixture.hidden_truth).
+class HiddenTruth(BaseModel):
+    # Keyed by line id; line_ids_for_scenario() treats these keys as valid line ids.
+    line_expectations: dict[str, ResolutionExpectation] = Field(default_factory=dict)
+    # Required: every scenario must define a header expectation.
+    header_expectation: HeaderExpectation
+    note_expectations: list[NoteExpectation] = Field(default_factory=list)
+    # Validated to be >= 0; defaults to 0.
+    efficient_step_target: int = Field(default=0, ge=0)
+# Schema of one scenario JSON file; load_scenario() validates the parsed file
+# into this model.
+class ScenarioFixture(BaseModel):
+    scenario_id: str
+    task_id: TaskId
+    case_id: str
+    title: str
+    description: str
+    # Required and validated to be >= 1.
+    step_limit: int = Field(..., ge=1)
+    queue_card: QueueCard
+    artifacts: list[ArtifactView] = Field(default_factory=list)
+    exceptions: list[ExceptionDetail] = Field(default_factory=list)
+    duplicate_candidates: list[DuplicateCandidate] = Field(default_factory=list)
+    # Required expectation data for the scenario.
+    hidden_truth: HiddenTruth
+def _scenario_path_for_id(scenario_id: str) -> Path:
+    return SCENARIOS_DIR / f"{scenario_id}.json"
+def load_scenario(
+    task_id: TaskId | str | None = None,
+    scenario_id: str | None = None,
+) -> ScenarioFixture:
+    if scenario_id is None:
+        task = TaskId(task_id or TaskId.EASY)
+        scenario_id = DEFAULT_SCENARIOS[task]
+    scenario_path = _scenario_path_for_id(scenario_id)
+    if not scenario_path.exists():
+        raise ValueError(f"Unknown scenario_id: {scenario_id}")
+    with scenario_path.open("r", encoding="utf-8") as handle:
+        payload = json.load(handle)
+    scenario = ScenarioFixture.model_validate(payload)
+    if task_id is not None and scenario.task_id is not TaskId(task_id):
+        raise ValueError(
+            f"Scenario '{scenario_id}' belongs to task '{scenario.task_id.value}', "
+            f"not '{TaskId(task_id).value}'"
+        )
+    return scenario
+# Generic per-type headline used by exception_summaries() for the exception
+# stubs; types missing from this table fall back to a default headline there.
+QUEUE_SAFE_EXCEPTION_HEADLINES: dict[ExceptionType, str] = {
+    ExceptionType.RECEIPT_QUANTITY_VARIANCE: "Receipt variance requires review",
+    ExceptionType.NON_PO_MISSING_APPROVAL: "Non-PO approval exception requires review",
+    ExceptionType.POSSIBLE_DUPLICATE: "Potential duplicate invoice requires review",
+    ExceptionType.PRICE_VARIANCE: "Price variance requires review",
+    ExceptionType.CUMULATIVE_BILLING_VARIANCE: "Cumulative billing exception requires review",
+    ExceptionType.TAX_VARIANCE: "Tax exception requires review",
+    ExceptionType.PAYMENT_TERMS_MISMATCH: "Payment terms exception requires review",
+}
+# Generic per-type short description used by exception_summaries() for the
+# exception stubs; types missing from this table fall back to a default hint there.
+QUEUE_SAFE_EXCEPTION_HINTS: dict[ExceptionType, str] = {
+    ExceptionType.RECEIPT_QUANTITY_VARIANCE: (
+        "Inspect this exception for receipt support and quantity details."
+    ),
+    ExceptionType.NON_PO_MISSING_APPROVAL: (
+        "Inspect this exception for workflow status and approval details."
+    ),
+    ExceptionType.POSSIBLE_DUPLICATE: (
+        "Inspect this exception for duplicate-match details before deciding."
+    ),
+    ExceptionType.PRICE_VARIANCE: (
+        "Inspect this exception for invoice-vs-PO price details."
+    ),
+    ExceptionType.CUMULATIVE_BILLING_VARIANCE: (
+        "Inspect this exception for history-aware billing facts."
+    ),
+    ExceptionType.TAX_VARIANCE: "Inspect this exception for tax calculation details.",
+    ExceptionType.PAYMENT_TERMS_MISMATCH: (
+        "Inspect this exception for payment-terms comparison details."
+    ),
+}
+def artifact_lookup(scenario: ScenarioFixture) -> dict[str, ArtifactView]:
+    return {artifact.artifact_id: artifact for artifact in scenario.artifacts}
+def artifact_references(scenario: ScenarioFixture) -> list[ArtifactReference]:
+    return [
+        ArtifactReference(
+            artifact_id=artifact.artifact_id,
+            artifact_type=artifact.artifact_type,
+            title=artifact.title,
+        )
+        for artifact in scenario.artifacts
+    ]
+def exception_lookup(scenario: ScenarioFixture) -> dict[str, ExceptionDetail]:
+    return {exception.exception_id: exception for exception in scenario.exceptions}
+def exception_summaries(scenario: ScenarioFixture) -> list[ExceptionSummary]:
+    return [
+        ExceptionSummary(
+            exception_id=exception.exception_id,
+            exception_type=exception.exception_type,
+            severity=exception.severity,
+            headline=QUEUE_SAFE_EXCEPTION_HEADLINES.get(
+                exception.exception_type,
+                "Exception requires review",
+            ),
+            impacted_line_ids=exception.impacted_line_ids,
+            short_description=QUEUE_SAFE_EXCEPTION_HINTS.get(
+                exception.exception_type,
+                "Inspect this exception for detailed facts before deciding.",
+            ),
+        )
+        for exception in scenario.exceptions
+    ]
+def line_ids_for_scenario(scenario: ScenarioFixture) -> set[str]:
+    line_ids: set[str] = set(scenario.hidden_truth.line_expectations.keys())
+    for artifact in scenario.artifacts:
+        for line_item in artifact.line_items:
+            line_ids.add(line_item.line_id)
+    return line_ids
+def route_target_values() -> set[str]:
+    return {route.value for route in RouteTarget}

summarize_eval.py ADDED Viewed

	@@ -0,0 +1,157 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+from statistics import mean
+from typing import Any
+TASK_ORDER = ("easy", "medium", "medium_plus", "hard")
+def find_latest_eval() -> Path:
+    candidates = sorted(Path("outputs/evals").glob("*.json"))
+    if not candidates:
+        raise FileNotFoundError("No eval JSON files found under outputs/evals/.")
+    return candidates[-1]
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Print a compact summary for an InvoiceOps eval JSON artifact."
+    )
+    parser.add_argument(
+        "paths",
+        nargs="*",
+        help="Optional eval JSON paths. Defaults to the latest file under outputs/evals/.",
+    )
+    return parser.parse_args()
+def _safe_mean(values: list[float]) -> float | None:
+    return round(mean(values), 4) if values else None
+def _request_error_count(result: dict[str, Any]) -> int:
+    attempts = result.get("model_attempts") or []
+    return sum(
+        1
+        for attempt in attempts
+        if isinstance(attempt, dict) and attempt.get("request_error")
+    )
+def summarize_eval(path: Path) -> dict[str, Any]:
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    results = payload.get("results") or []
+    task_scores: dict[str, float] = {}
+    resolution_scores: list[float] = []
+    evidence_scores: list[float] = []
+    documentation_scores: list[float] = []
+    efficiency_scores: list[float] = []
+    steps: list[float] = []
+    reward_lengths: list[float] = []
+    fallback_count = 0
+    parse_failure_count = 0
+    request_error_count = 0
+    for result in results:
+        task_id = result.get("task_id")
+        score = result.get("score")
+        if isinstance(task_id, str) and isinstance(score, (int, float)):
+            task_scores[task_id] = round(float(score), 4)
+        if result.get("used_fallback") is True:
+            fallback_count += 1
+        if result.get("decision_parsed") is False:
+            parse_failure_count += 1
+        request_error_count += _request_error_count(result)
+        if isinstance(result.get("steps_used"), (int, float)):
+            steps.append(float(result["steps_used"]))
+        reward_trace = result.get("reward_trace")
+        if isinstance(reward_trace, list):
+            reward_lengths.append(float(len(reward_trace)))
+        report = result.get("submission_report")
+        if not isinstance(report, dict):
+            continue
+        for source, bucket in (
+            ("resolution_score", resolution_scores),
+            ("evidence_score", evidence_scores),
+            ("documentation_score", documentation_scores),
+            ("efficiency_score", efficiency_scores),
+        ):
+            value = report.get(source)
+            if isinstance(value, (int, float)):
+                bucket.append(float(value))
+    return {
+        "path": str(path),
+        "run_id": payload.get("run_id"),
+        "model_name": payload.get("model_name"),
+        "mean_score": payload.get("mean_score"),
+        "raw_mean_score": payload.get("raw_mean_score"),
+        "strict_baseline_scoring": payload.get("strict_baseline_scoring"),
+        "task_scores": task_scores,
+        "fallback_count": fallback_count,
+        "parse_failure_count": parse_failure_count,
+        "request_error_count": request_error_count,
+        "avg_resolution_score": _safe_mean(resolution_scores),
+        "avg_evidence_score": _safe_mean(evidence_scores),
+        "avg_documentation_score": _safe_mean(documentation_scores),
+        "avg_efficiency_score": _safe_mean(efficiency_scores),
+        "avg_steps_used": _safe_mean(steps),
+        "avg_reward_trace_len": _safe_mean(reward_lengths),
+    }
+def print_summary(summary: dict[str, Any]) -> None:
+    print(f"path: {summary['path']}")
+    print(f"run_id: {summary['run_id']}")
+    print(f"model: {summary['model_name']}")
+    print(
+        "mean_score: "
+        f"{summary['mean_score']:.4f} "
+        f"(raw_mean_score={summary['raw_mean_score']:.4f}, "
+        f"strict_baseline_scoring={summary['strict_baseline_scoring']})"
+    )
+    print("tasks:")
+    for task_id in TASK_ORDER:
+        score = summary["task_scores"].get(task_id)
+        rendered = "-" if score is None else f"{score:.4f}"
+        print(f"  {task_id}: {rendered}")
+    print("components:")
+    for label in (
+        "avg_resolution_score",
+        "avg_evidence_score",
+        "avg_documentation_score",
+        "avg_efficiency_score",
+        "avg_steps_used",
+        "avg_reward_trace_len",
+    ):
+        value = summary[label]
+        rendered = "-" if value is None else f"{value:.4f}"
+        print(f"  {label}: {rendered}")
+    print("health:")
+    print(f"  fallbacks: {summary['fallback_count']}")
+    print(f"  parse_failures: {summary['parse_failure_count']}")
+    print(f"  request_errors: {summary['request_error_count']}")
+def main() -> None:
+    args = parse_args()
+    paths = [Path(value) for value in args.paths] if args.paths else [find_latest_eval()]
+    for index, path in enumerate(paths):
+        if index:
+            print()
+        print_summary(summarize_eval(path))
+if __name__ == "__main__":
+    main()

tests/conftest.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import sys
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]
+PARENT = ROOT.parent
+if str(PARENT) not in sys.path:
+    sys.path.insert(0, str(PARENT))
+for module_name in list(sys.modules):
+    if module_name == "invoiceops_env" or module_name.startswith("invoiceops_env."):
+        sys.modules.pop(module_name, None)

tests/test_baseline_smoke.py ADDED Viewed

	@@ -0,0 +1,343 @@

+from invoiceops_env.inference import (
+    API_BASE_URL,
+    ObservationMemory,
+    TASKS,
+    _parse_action_payload,
+    _safe_json_load,
+    build_action_prompt,
+    build_observation_snapshot,
+    resolve_api_key,
+    strict_task_score,
+    update_memory,
+)
+from invoiceops_env.models import (
+    ActionType,
+    Disposition,
+    DuplicateMatchStrategy,
+    InvoiceOpsAction,
+    PaymentRecommendation,
+    ReasonCode,
+)
+from invoiceops_env.server.invoiceops_env_environment import InvoiceOpsEnvironment
+def test_parse_action_payload_salvages_common_shapes() -> None:
+    payload = {
+        "action": "set_line_resolution",
+        "args": {
+            "line": "L1",
+            "decision": "hold",
+            "reason_code": "receipt_not_confirmed",
+            "refs": ["art-history", "EX-RECEIPT-L2"],
+            "route": "receiving",
+        },
+    }
+    action = _parse_action_payload(payload)
+    assert action is not None
+    assert action.action_type is ActionType.SET_LINE_RESOLUTION
+    assert action.line_id == "L1"
+    assert action.disposition is Disposition.HOLD
+    assert action.reason_codes == [ReasonCode.RECEIPT_NOT_CONFIRMED]
+    assert action.evidence_refs == ["art-history", "EX-RECEIPT-L2"]
+def test_parse_action_payload_rejects_missing_required_fields() -> None:
+    payload = {
+        "action_type": "set_header_resolution",
+        "payment_recommendation": "hold_full_invoice",
+        "reason_codes": ["receipt_not_confirmed"],
+    }
+    assert _parse_action_payload(payload) is None
+def test_parse_action_payload_accepts_submit_case() -> None:
+    action = _parse_action_payload({"action_type": "submit_case"})
+    assert action is not None
+    assert action.action_type is ActionType.SUBMIT_CASE
+def test_safe_json_load_strips_think_blocks() -> None:
+    payload = _safe_json_load(
+        '<think>reasoning here</think>{"action_type":"submit_case"}'
+    )
+    assert payload == {"action_type": "submit_case"}
+def test_initial_snapshot_does_not_preload_case_details() -> None:
+    env = InvoiceOpsEnvironment()
+    observation = env.reset(task_id="hard")
+    memory = ObservationMemory()
+    snapshot = build_observation_snapshot(observation, memory)
+    assert snapshot["artifacts"] == {}
+    assert snapshot["exceptions"] == []
+    assert snapshot["duplicate_candidates"] == []
+    assert snapshot["known_refs"] == []
+def test_snapshot_only_contains_explicitly_observed_details() -> None:
+    env = InvoiceOpsEnvironment()
+    observation = env.reset(task_id="medium")
+    memory = ObservationMemory()
+    update_memory(memory, observation)
+    observation = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.OPEN_ARTIFACT,
+            artifact_id="art-invoice",
+        )
+    )
+    update_memory(memory, observation)
+    first_exception = observation.visible_exceptions[0]
+    observation = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id=first_exception.exception_id,
+        )
+    )
+    update_memory(memory, observation)
+    snapshot = build_observation_snapshot(observation, memory)
+    assert set(snapshot["artifacts"]) == {"invoice_packet"}
+    assert [exception["type"] for exception in snapshot["exceptions"]] == [
+        "possible_duplicate"
+    ]
+    assert observation.known_refs == ["EX-POSSIBLE-DUP", "art-invoice"]
+def test_visible_exception_stubs_hide_detailed_facts_until_inspection() -> None:
+    env = InvoiceOpsEnvironment()
+    observation = env.reset(task_id="medium")
+    stub = observation.visible_exceptions[0]
+    assert stub.headline == "Potential duplicate invoice requires review"
+    assert stub.short_description == (
+        "Inspect this exception for duplicate-match details before deciding."
+    )
+    inspected = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id=stub.exception_id,
+        )
+    ).inspected_exception
+    assert inspected is not None
+    assert inspected.headline == "Duplicate control is open for this invoice"
+    assert any(
+        field.label == "Invoice number" and field.value == "TL-9205/A"
+        for field in inspected.fields
+    )
+def test_hard_exceptions_do_not_expose_derived_answer_fields() -> None:
+    env = InvoiceOpsEnvironment()
+    env.reset(task_id="hard")
+    l1 = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-RECEIPT-L1",
+        )
+    ).inspected_exception
+    l2 = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-RECEIPT-L2",
+        )
+    ).inspected_exception
+    assert l1 is not None
+    assert l2 is not None
+    assert {field.label for field in l1.fields} == {
+        "Invoice quantity",
+        "Received quantity",
+        "Short quantity",
+    }
+    assert {field.label for field in l2.fields} == {
+        "Invoice quantity",
+        "Initial posted receipt",
+        "Latest control update",
+    }
+def test_hard_receipt_log_points_to_history_instead_of_reversal_answer() -> None:
+    env = InvoiceOpsEnvironment()
+    observation = env.reset(task_id="hard")
+    observation = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.OPEN_ARTIFACT,
+            artifact_id="art-receipts",
+        )
+    )
+    opened = observation.opened_artifact
+    assert opened is not None
+    l2 = next(item for item in opened.line_items if item.line_id == "L2")
+    assert l2.status == "received_under_review"
+    assert "history" in l2.notes.lower()
+    assert "reversed" not in l2.notes.lower()
+def test_medium_plus_exception_does_not_expose_unsupported_amount() -> None:
+    env = InvoiceOpsEnvironment()
+    env.reset(task_id="medium_plus")
+    inspected = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-RECEIPT-L2",
+        )
+    ).inspected_exception
+    assert inspected is not None
+    assert {field.label for field in inspected.fields} == {
+        "Invoice quantity",
+        "Received quantity",
+        "Short quantity",
+    }
+def test_action_prompt_describes_single_step_agent_loop() -> None:
+    env = InvoiceOpsEnvironment()
+    observation = env.reset(task_id="medium")
+    prompt = build_action_prompt(observation, ObservationMemory())
+    assert "Return exactly one JSON object for the single best next action." in prompt
+    assert "Use open_artifact, inspect_exception, and run_duplicate_check" in prompt
+    assert "next owner or follow-up queue" in prompt
+    assert "A real case-level blocker can justify hold_full_invoice" in prompt
+    assert "Allowed match_strategy values" in prompt
+    assert "Action JSON templates" in prompt
+    assert "<artifact_id>" in prompt
+    assert '"match_strategy":"normalized_invoice_no"' in prompt
+    assert '"action_type":"submit_case"' in prompt
+    assert "If any line is approved, use release_approved_lines" not in prompt
+def test_hf_router_configuration_requires_hf_token(monkeypatch) -> None:
+    monkeypatch.setenv("HF_TOKEN", "hf-secret")
+    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+    monkeypatch.delenv("API_KEY", raising=False)
+    monkeypatch.delenv("OPENROUTER_API_KEY", raising=False)
+    monkeypatch.delenv("API_BASE_URL", raising=False)
+    api_key, source = resolve_api_key()
+    assert api_key == "hf-secret"
+    assert source == "HF_TOKEN"
+    assert API_BASE_URL == "https://router.huggingface.co/v1"
+def test_invoice_baseline_defaults_to_strict_scoring(monkeypatch) -> None:
+    monkeypatch.delenv("STRICT_BASELINE_SCORING", raising=False)
+    assert strict_task_score(0.2136, used_fallback=True) == 0.0
+    assert strict_task_score(0.2136, used_fallback=False) == 0.2136
+def test_public_task_loop_uses_four_task_progression() -> None:
+    assert [task.value for task in TASKS] == [
+        "easy",
+        "medium",
+        "medium_plus",
+        "hard",
+    ]
+def test_duplicate_check_exposes_strategy_and_candidate_refs() -> None:
+    env = InvoiceOpsEnvironment()
+    observation = env.reset(task_id="medium")
+    memory = ObservationMemory()
+    update_memory(memory, observation)
+    observation = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.RUN_DUPLICATE_CHECK,
+            match_strategy=DuplicateMatchStrategy.NORMALIZED_INVOICE_NUMBER,
+        )
+    )
+    update_memory(memory, observation)
+    snapshot = build_observation_snapshot(observation, memory)
+    assert "duplicate_check:normalized_invoice_no" in observation.known_refs
+    assert "CAND-NORM-01" in observation.known_refs
+    assert snapshot["duplicate_candidates"] == [
+        {
+            "candidate_id": "CAND-NORM-01",
+            "invoice_number": "TL9205A",
+            "invoice_date": "2026-03-10",
+            "gross_amount": 3800.0,
+            "status": "reversed on 2026-03-11 after import duplicate; closed",
+            "match_basis": "Normalized invoice number + vendor + gross amount",
+            "overlap_summary": "Same normalized invoice number. Prior record was reversed before payment.",
+        }
+    ]
+def test_duplicate_check_action_parses() -> None:
+    action = _parse_action_payload(
+        {
+            "action_type": "run_duplicate_check",
+            "match_strategy": DuplicateMatchStrategy.NORMALIZED_INVOICE_NUMBER.value,
+        }
+    )
+    assert action is not None
+    assert action.action_type is ActionType.RUN_DUPLICATE_CHECK
+    assert (
+        action.match_strategy
+        is DuplicateMatchStrategy.NORMALIZED_INVOICE_NUMBER
+    )
+def test_duplicate_check_action_parses_common_aliases() -> None:
+    action = _parse_action_payload(
+        {
+            "action_type": "run_duplicate_check",
+            "match_strategy": "vendor_invoice_amount",
+        }
+    )
+    assert action is not None
+    assert action.action_type is ActionType.RUN_DUPLICATE_CHECK
+    assert action.match_strategy is DuplicateMatchStrategy.VENDOR_AMOUNT_DATE
+def test_header_resolution_action_parses_common_aliases() -> None:
+    action = _parse_action_payload(
+        {
+            "action_type": "set_header_resolution",
+            "recommendation": PaymentRecommendation.HOLD_FULL_INVOICE.value,
+            "reason_code": [
+                ReasonCode.NON_PO_APPROVAL_MISSING.value,
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW.value,
+            ],
+            "refs": ["art-approval", "duplicate_check:normalized_invoice_no"],
+            "route": "requester",
+        }
+    )
+    assert action is not None
+    assert action.action_type is ActionType.SET_HEADER_RESOLUTION
+    assert (
+        action.payment_recommendation
+        is PaymentRecommendation.HOLD_FULL_INVOICE
+    )
+    assert action.reason_codes == [
+        ReasonCode.NON_PO_APPROVAL_MISSING,
+        ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+    ]
+    assert action.evidence_refs == [
+        "art-approval",
+        "duplicate_check:normalized_invoice_no",
+    ]

tests/test_env_flow.py ADDED Viewed

	@@ -0,0 +1,640 @@

+"""End-to-end environment flow tests for the 4-task InvoiceOps ladder."""
+from invoiceops_env.models import (
+    ActionType,
+    Disposition,
+    InvoiceOpsAction,
+    NoteType,
+    PaymentRecommendation,
+    ReasonCode,
+)
+from invoiceops_env.server.invoiceops_env_environment import InvoiceOpsEnvironment
+from invoiceops_env.server.scenario_loader import load_scenario
+def _run_easy_perfect_case() -> float:
+    env = InvoiceOpsEnvironment()
+    env.reset(task_id="easy")
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-invoice")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-approval"
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-NONPO-APPROVAL",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-policy")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.ADD_NOTE,
+            note_type=NoteType.ISSUE_SUMMARY,
+            reason_codes=[ReasonCode.NON_PO_APPROVAL_MISSING],
+            evidence_refs=["art-approval", "art-policy"],
+            text="Approval workflow is not initiated and the requester must start approval before payment can release.",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L1",
+            disposition=Disposition.HOLD,
+            reason_codes=[ReasonCode.NON_PO_APPROVAL_MISSING],
+            evidence_refs=[
+                "art-invoice",
+                "art-approval",
+                "art-policy",
+                "EX-NONPO-APPROVAL",
+            ],
+            route_to="requester",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_HEADER_RESOLUTION,
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.NON_PO_APPROVAL_MISSING],
+            evidence_refs=[
+                "art-invoice",
+                "art-approval",
+                "art-policy",
+                "EX-NONPO-APPROVAL",
+            ],
+            route_to="requester",
+        )
+    )
+    result = env.step(InvoiceOpsAction(action_type=ActionType.SUBMIT_CASE))
+    assert result.done is True
+    return float(result.episode_score or 0.0)
+def _run_medium_perfect_case() -> float:
+    env = InvoiceOpsEnvironment()
+    env.reset(task_id="medium")
+    env.step(InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-po"))
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-receipts"
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-POSSIBLE-DUP",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.RUN_DUPLICATE_CHECK,
+            match_strategy="normalized_invoice_no",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.ADD_NOTE,
+            note_type=NoteType.REVIEW_SUMMARY,
+            reason_codes=[
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=["duplicate_check:normalized_invoice_no", "CAND-NORM-01"],
+            text="The normalized duplicate hit is a reversed prior record, so the invoice can release.",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L1",
+            disposition=Disposition.APPROVE,
+            reason_codes=[
+                ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-po",
+                "art-receipts",
+                "EX-POSSIBLE-DUP",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L2",
+            disposition=Disposition.APPROVE,
+            reason_codes=[
+                ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-po",
+                "art-receipts",
+                "EX-POSSIBLE-DUP",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_HEADER_RESOLUTION,
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-po",
+                "art-receipts",
+                "EX-POSSIBLE-DUP",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    result = env.step(InvoiceOpsAction(action_type=ActionType.SUBMIT_CASE))
+    assert result.done is True
+    return float(result.episode_score or 0.0)
+def _run_medium_plus_perfect_case() -> float:
+    env = InvoiceOpsEnvironment()
+    env.reset(task_id="medium_plus")
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-invoice")
+    )
+    env.step(InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-po"))
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-receipts"
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-POSSIBLE-DUP",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.RUN_DUPLICATE_CHECK,
+            match_strategy="normalized_invoice_no",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-RECEIPT-L2",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-policy")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.ADD_NOTE,
+            note_type=NoteType.REVIEW_SUMMARY,
+            reason_codes=[
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=["duplicate_check:normalized_invoice_no", "CAND-NORM-01"],
+            text="The normalized duplicate hit is a reversed prior record, so duplicate review is cleared.",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.ADD_NOTE,
+            note_type=NoteType.ISSUE_SUMMARY,
+            reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+            evidence_refs=[
+                "art-invoice",
+                "art-receipts",
+                "art-policy",
+                "EX-RECEIPT-L2",
+            ],
+            text="L2 remains blocked because the unsupported amount exceeds the de minimis receipt threshold.",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L1",
+            disposition=Disposition.APPROVE,
+            reason_codes=[
+                ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-po",
+                "art-receipts",
+                "EX-POSSIBLE-DUP",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L2",
+            disposition=Disposition.HOLD,
+            reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+            evidence_refs=[
+                "art-invoice",
+                "art-receipts",
+                "art-policy",
+                "EX-RECEIPT-L2",
+            ],
+            route_to="receiving",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_HEADER_RESOLUTION,
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+                ReasonCode.RECEIPT_NOT_CONFIRMED,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-policy",
+                "art-receipts",
+                "EX-RECEIPT-L2",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    result = env.step(InvoiceOpsAction(action_type=ActionType.SUBMIT_CASE))
+    assert result.done is True
+    return float(result.episode_score or 0.0)
+def _run_hard_perfect_case() -> float:
+    env = InvoiceOpsEnvironment()
+    env.reset(task_id="hard")
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-invoice")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-POSSIBLE-DUP",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.RUN_DUPLICATE_CHECK,
+            match_strategy="normalized_invoice_no",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-receipts"
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-RECEIPT-L1",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-RECEIPT-L2",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-history")
+    )
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-vendor")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-TAX-001",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-policy")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.ADD_NOTE,
+            note_type=NoteType.REVIEW_SUMMARY,
+            reason_codes=[
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=["duplicate_check:normalized_invoice_no", "CAND-NORM-01"],
+            text="The normalized duplicate hit is a reversed prior record, so the duplicate control is cleared.",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.ADD_NOTE,
+            note_type=NoteType.ISSUE_SUMMARY,
+            reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+            evidence_refs=["art-history", "EX-RECEIPT-L2"],
+            text="L2 remains blocked because the latest receiving history shows an open damage hold.",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.ADD_NOTE,
+            note_type=NoteType.ESCALATION_REQUEST,
+            reason_codes=[ReasonCode.TAX_AMOUNT_MISMATCH],
+            evidence_refs=["art-vendor", "art-policy", "EX-TAX-001"],
+            text="The project is tax exempt, so payment must remain blocked pending Tax Ops review.",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L1",
+            disposition=Disposition.APPROVE,
+            reason_codes=[
+                ReasonCode.PARTIAL_RECEIPT_PENDING,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-invoice",
+                "art-receipts",
+                "EX-RECEIPT-L1",
+                "art-policy",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L2",
+            disposition=Disposition.HOLD,
+            reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+            evidence_refs=[
+                "art-receipts",
+                "art-history",
+                "EX-RECEIPT-L2",
+                "art-policy",
+            ],
+            route_to="receiving",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L3",
+            disposition=Disposition.APPROVE,
+            reason_codes=[
+                ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-invoice",
+                "art-receipts",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_HEADER_RESOLUTION,
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.TAX_AMOUNT_MISMATCH],
+            evidence_refs=[
+                "art-invoice",
+                "art-vendor",
+                "art-policy",
+                "EX-TAX-001",
+            ],
+            route_to="tax",
+        )
+    )
+    result = env.step(InvoiceOpsAction(action_type=ActionType.SUBMIT_CASE))
+    assert result.done is True
+    return float(result.episode_score or 0.0)
+def test_perfect_cases_score_near_one() -> None:
+    assert _run_easy_perfect_case() >= 0.99
+    assert _run_medium_perfect_case() >= 0.99
+    assert _run_medium_plus_perfect_case() >= 0.99
+    assert _run_hard_perfect_case() >= 0.99
+def test_tax_hold_can_coexist_with_approved_lines() -> None:
+    env = InvoiceOpsEnvironment()
+    env.reset(task_id="hard")
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-invoice")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.RUN_DUPLICATE_CHECK,
+            match_strategy="normalized_invoice_no",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-receipts"
+        )
+    )
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-vendor")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-RECEIPT-L2",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-TAX-001",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-history")
+    )
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-policy")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L1",
+            disposition=Disposition.APPROVE,
+            reason_codes=[ReasonCode.SAFE_TO_PAY],
+            evidence_refs=[
+                "art-invoice",
+                "art-receipts",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+                "EX-RECEIPT-L1",
+            ],
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L2",
+            disposition=Disposition.HOLD,
+            reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+            evidence_refs=["art-history", "EX-RECEIPT-L2"],
+            route_to="receiving",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L3",
+            disposition=Disposition.APPROVE,
+            reason_codes=[ReasonCode.SAFE_TO_PAY],
+            evidence_refs=[
+                "art-invoice",
+                "art-receipts",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    result = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_HEADER_RESOLUTION,
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.TAX_AMOUNT_MISMATCH],
+            evidence_refs=[
+                "art-invoice",
+                "art-vendor",
+                "art-policy",
+                "EX-TAX-001",
+            ],
+            route_to="tax",
+        )
+    )
+    assert result.done is False
+    assert result.message == "Saved header recommendation."
+def test_release_approved_lines_can_coexist_with_held_lines() -> None:
+    env = InvoiceOpsEnvironment()
+    env.reset(task_id="medium_plus")
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-po")
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-receipts"
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.RUN_DUPLICATE_CHECK,
+            match_strategy="normalized_invoice_no",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.INSPECT_EXCEPTION,
+            exception_id="EX-RECEIPT-L2",
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L1",
+            disposition=Disposition.APPROVE,
+            reason_codes=[ReasonCode.SAFE_TO_PAY],
+            evidence_refs=[
+                "art-po",
+                "art-receipts",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_LINE_RESOLUTION,
+            line_id="L2",
+            disposition=Disposition.HOLD,
+            reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+            evidence_refs=["art-receipts", "EX-RECEIPT-L2"],
+            route_to="receiving",
+        )
+    )
+    result = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_HEADER_RESOLUTION,
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[ReasonCode.SAFE_TO_PAY],
+            evidence_refs=[
+                "art-receipts",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+        )
+    )
+    assert result.done is False
+    assert result.message == "Saved header recommendation."
+def test_release_approved_lines_without_approved_lines_is_invalid() -> None:
+    env = InvoiceOpsEnvironment()
+    env.reset(task_id="easy")
+    env.step(
+        InvoiceOpsAction(action_type=ActionType.OPEN_ARTIFACT, artifact_id="art-policy")
+    )
+    result = env.step(
+        InvoiceOpsAction(
+            action_type=ActionType.SET_HEADER_RESOLUTION,
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[ReasonCode.SAFE_TO_PAY],
+            evidence_refs=["art-policy"],
+        )
+    )
+    assert result.done is False
+    submit_result = env.step(InvoiceOpsAction(action_type=ActionType.SUBMIT_CASE))
+    assert submit_result.done is False
+    assert "requires at least one approved line" in submit_result.message
+def test_hard_budget_has_recovery_slack() -> None:
+    scenario = load_scenario(task_id="hard")
+    assert scenario.step_limit - scenario.hidden_truth.efficient_step_target >= 5
+def test_medium_plus_budget_has_recovery_slack() -> None:
+    scenario = load_scenario(task_id="medium_plus")
+    assert scenario.step_limit - scenario.hidden_truth.efficient_step_target >= 4

tests/test_grader.py ADDED Viewed

	@@ -0,0 +1,1029 @@

+"""Grader discrimination tests for the 4-task InvoiceOps benchmark."""
+from invoiceops_env.models import (
+    DecisionBand,
+    Disposition,
+    HeaderResolution,
+    LineResolution,
+    PaymentRecommendation,
+    ReasonCode,
+    RouteTarget,
+)
+from invoiceops_env.server.grader import ReviewTrace, grade_case
+from invoiceops_env.server.scenario_loader import load_scenario
+def test_easy_single_line_header_fallback_rewards_correct_route() -> None:
+    scenario = load_scenario(task_id="easy")
+    trace = ReviewTrace(
+        ref_steps={
+            "art-invoice": 1,
+            "art-approval": 2,
+            "EX-NONPO-APPROVAL": 3,
+            "art-policy": 4,
+        },
+        steps_used=6,
+    )
+    correct_header_only = grade_case(
+        scenario,
+        line_resolutions={},
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.NON_PO_APPROVAL_MISSING],
+            evidence_refs=[
+                "art-invoice",
+                "art-approval",
+                "art-policy",
+                "EX-NONPO-APPROVAL",
+            ],
+            route_to=RouteTarget.REQUESTER,
+            saved_at_step=5,
+        ),
+        notes={},
+        trace=trace,
+    )
+    wrong_route_header_only = grade_case(
+        scenario,
+        line_resolutions={},
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.NON_PO_APPROVAL_MISSING],
+            evidence_refs=[
+                "art-invoice",
+                "art-approval",
+                "art-policy",
+                "EX-NONPO-APPROVAL",
+            ],
+            route_to=RouteTarget.AP_MANAGER,
+            saved_at_step=5,
+        ),
+        notes={},
+        trace=trace,
+    )
+    assert correct_header_only.decision_band is DecisionBand.BEST
+    assert wrong_route_header_only.decision_band is DecisionBand.WRONG
+    assert correct_header_only.total_score > 0.95
+    assert wrong_route_header_only.total_score < 0.30
+def test_medium_duplicate_evidence_creates_best_safe_and_wrong_bands() -> None:
+    scenario = load_scenario(task_id="medium")
+    best_trace = ReviewTrace(
+        ref_steps={
+            "art-po": 1,
+            "art-receipts": 2,
+            "EX-POSSIBLE-DUP": 3,
+            "duplicate_check:normalized_invoice_no": 4,
+            "CAND-NORM-01": 4,
+        },
+        steps_used=8,
+    )
+    best_report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=[
+                    "art-po",
+                    "art-receipts",
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=5,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=[
+                    "art-po",
+                    "art-receipts",
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=6,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-po",
+                "art-receipts",
+                "EX-POSSIBLE-DUP",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+            route_to=None,
+            saved_at_step=7,
+        ),
+        notes={},
+        trace=best_trace,
+    )
+    safe_hold_report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.POSSIBLE_DUPLICATE_REVIEW],
+                evidence_refs=[
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=5,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.POSSIBLE_DUPLICATE_REVIEW],
+                evidence_refs=[
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=6,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.POSSIBLE_DUPLICATE_REVIEW],
+            evidence_refs=[
+                "EX-POSSIBLE-DUP",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+            route_to=None,
+            saved_at_step=7,
+        ),
+        notes={},
+        trace=best_trace,
+    )
+    wrong_heuristic_hold = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.POSSIBLE_DUPLICATE_REVIEW],
+                evidence_refs=[
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:vendor_amount_date",
+                    "CAND-AMT-02",
+                ],
+                route_to=None,
+                saved_at_step=5,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.POSSIBLE_DUPLICATE_REVIEW],
+                evidence_refs=[
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:vendor_amount_date",
+                    "CAND-AMT-02",
+                ],
+                route_to=None,
+                saved_at_step=6,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.POSSIBLE_DUPLICATE_REVIEW],
+            evidence_refs=[
+                "EX-POSSIBLE-DUP",
+                "duplicate_check:vendor_amount_date",
+                "CAND-AMT-02",
+            ],
+            route_to=None,
+            saved_at_step=7,
+        ),
+        notes={},
+        trace=ReviewTrace(
+            ref_steps={
+                "art-po": 1,
+                "art-receipts": 2,
+                "EX-POSSIBLE-DUP": 3,
+                "duplicate_check:vendor_amount_date": 4,
+                "CAND-AMT-02": 4,
+            },
+            steps_used=8,
+        ),
+    )
+    assert best_report.decision_band is DecisionBand.BEST
+    assert safe_hold_report.decision_band is DecisionBand.SAFE_SUBOPTIMAL
+    assert wrong_heuristic_hold.decision_band is DecisionBand.WRONG
+    assert best_report.total_score > safe_hold_report.total_score > wrong_heuristic_hold.total_score
+    assert best_report.total_score > 0.95
+    assert 0.55 < safe_hold_report.total_score < 0.75
+    assert wrong_heuristic_hold.total_score < 0.30
+def test_medium_approval_without_duplicate_clearance_is_wrong() -> None:
+    scenario = load_scenario(task_id="medium")
+    trace = ReviewTrace(
+        ref_steps={
+            "art-po": 1,
+            "art-receipts": 2,
+            "EX-POSSIBLE-DUP": 3,
+        },
+        steps_used=7,
+    )
+    report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[ReasonCode.SAFE_TO_PAY],
+                evidence_refs=["art-po", "art-receipts", "EX-POSSIBLE-DUP"],
+                route_to=None,
+                saved_at_step=4,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.APPROVE,
+                reason_codes=[ReasonCode.SAFE_TO_PAY],
+                evidence_refs=["art-po", "art-receipts", "EX-POSSIBLE-DUP"],
+                route_to=None,
+                saved_at_step=5,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[ReasonCode.SAFE_TO_PAY],
+            evidence_refs=["art-po", "art-receipts", "EX-POSSIBLE-DUP"],
+            route_to=None,
+            saved_at_step=6,
+        ),
+        notes={},
+        trace=trace,
+    )
+    assert report.decision_band is DecisionBand.WRONG
+    assert report.total_score < 0.30
+def test_medium_observed_evidence_counts_even_if_final_refs_are_sparse() -> None:
+    scenario = load_scenario(task_id="medium")
+    trace = ReviewTrace(
+        ref_steps={
+            "art-po": 1,
+            "art-receipts": 2,
+            "EX-POSSIBLE-DUP": 3,
+            "duplicate_check:normalized_invoice_no": 4,
+            "CAND-NORM-01": 4,
+        },
+        steps_used=8,
+    )
+    report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=["art-po", "CAND-NORM-01"],
+                route_to=None,
+                saved_at_step=5,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=["art-po"],
+                route_to=None,
+                saved_at_step=6,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[
+                ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=["art-po", "CAND-NORM-01"],
+            route_to=None,
+            saved_at_step=7,
+        ),
+        notes={},
+        trace=trace,
+    )
+    assert report.decision_band is DecisionBand.BEST
+    assert 0.80 < report.total_score < 0.98
+    assert report.evidence_score < 0.70
+def test_medium_duplicate_hit_without_po_and_receipts_stays_wrong() -> None:
+    scenario = load_scenario(task_id="medium")
+    trace = ReviewTrace(
+        ref_steps={
+            "EX-POSSIBLE-DUP": 1,
+            "duplicate_check:normalized_invoice_no": 2,
+            "CAND-NORM-01": 2,
+        },
+        steps_used=6,
+    )
+    report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[ReasonCode.SAFE_TO_PAY],
+                evidence_refs=["CAND-NORM-01"],
+                route_to=None,
+                saved_at_step=3,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.APPROVE,
+                reason_codes=[ReasonCode.SAFE_TO_PAY],
+                evidence_refs=["CAND-NORM-01"],
+                route_to=None,
+                saved_at_step=4,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[ReasonCode.SAFE_TO_PAY],
+            evidence_refs=["CAND-NORM-01"],
+            route_to=None,
+            saved_at_step=5,
+        ),
+        notes={},
+        trace=trace,
+    )
+    assert report.decision_band is DecisionBand.WRONG
+    assert report.total_score < 0.30
+def test_medium_plus_partial_release_creates_best_safe_wrong_and_unsafe_bands() -> None:
+    scenario = load_scenario(task_id="medium_plus")
+    full_trace = ReviewTrace(
+        ref_steps={
+            "art-invoice": 1,
+            "art-po": 2,
+            "art-receipts": 3,
+            "EX-POSSIBLE-DUP": 4,
+            "duplicate_check:normalized_invoice_no": 5,
+            "CAND-NORM-01": 5,
+            "EX-RECEIPT-L2": 6,
+            "art-policy": 7,
+        },
+        steps_used=12,
+    )
+    conservative_trace = ReviewTrace(
+        ref_steps={
+            "art-invoice": 1,
+            "art-po": 2,
+            "art-receipts": 3,
+            "EX-POSSIBLE-DUP": 4,
+            "duplicate_check:normalized_invoice_no": 5,
+            "CAND-NORM-01": 5,
+            "EX-RECEIPT-L2": 6,
+        },
+        steps_used=11,
+    )
+    best_report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=[
+                    "art-po",
+                    "art-receipts",
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=8,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+                evidence_refs=[
+                    "art-invoice",
+                    "art-receipts",
+                    "EX-RECEIPT-L2",
+                    "art-policy",
+                ],
+                route_to=RouteTarget.RECEIVING,
+                saved_at_step=9,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+                ReasonCode.RECEIPT_NOT_CONFIRMED,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-policy",
+                "art-receipts",
+                "EX-RECEIPT-L2",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+            route_to=None,
+            saved_at_step=10,
+        ),
+        notes={},
+        trace=full_trace,
+    )
+    safe_report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.POSSIBLE_DUPLICATE_REVIEW],
+                evidence_refs=[
+                    "art-po",
+                    "art-receipts",
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=7,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+                evidence_refs=["art-receipts", "EX-RECEIPT-L2"],
+                route_to=RouteTarget.RECEIVING,
+                saved_at_step=8,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+                ReasonCode.RECEIPT_NOT_CONFIRMED,
+            ],
+            evidence_refs=[
+                "art-receipts",
+                "EX-RECEIPT-L2",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+            route_to=None,
+            saved_at_step=9,
+        ),
+        notes={},
+        trace=conservative_trace,
+    )
+    wrong_report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=[
+                    "art-po",
+                    "art-receipts",
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=7,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+                evidence_refs=["art-invoice", "art-receipts", "EX-RECEIPT-L2"],
+                route_to=RouteTarget.RECEIVING,
+                saved_at_step=8,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[
+                ReasonCode.POSSIBLE_DUPLICATE_REVIEW,
+                ReasonCode.RECEIPT_NOT_CONFIRMED,
+                ReasonCode.SAFE_TO_PAY,
+            ],
+            evidence_refs=[
+                "art-receipts",
+                "EX-RECEIPT-L2",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+            route_to=None,
+            saved_at_step=9,
+        ),
+        notes={},
+        trace=conservative_trace,
+    )
+    unsafe_report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=[
+                    "art-po",
+                    "art-receipts",
+                    "EX-POSSIBLE-DUP",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=8,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.APPROVE,
+                reason_codes=[ReasonCode.SAFE_TO_PAY],
+                evidence_refs=[
+                    "art-invoice",
+                    "art-receipts",
+                    "EX-RECEIPT-L2",
+                    "art-policy",
+                ],
+                route_to=None,
+                saved_at_step=9,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[ReasonCode.SAFE_TO_PAY],
+            evidence_refs=[
+                "art-policy",
+                "art-receipts",
+                "EX-RECEIPT-L2",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+            route_to=None,
+            saved_at_step=10,
+        ),
+        notes={},
+        trace=full_trace,
+    )
+    assert best_report.decision_band is DecisionBand.BEST
+    assert safe_report.decision_band is DecisionBand.SAFE_SUBOPTIMAL
+    assert wrong_report.decision_band is DecisionBand.WRONG
+    assert unsafe_report.decision_band is DecisionBand.UNSAFE
+    assert (
+        best_report.total_score
+        > safe_report.total_score
+        > wrong_report.total_score
+        > unsafe_report.total_score
+    )
+    assert best_report.total_score > 0.95
+    assert 0.55 < safe_report.total_score < 0.80
+    assert wrong_report.total_score < 0.45
+    assert unsafe_report.total_score < 0.15
+def test_hard_composition_rewards_mixed_judgment_and_penalizes_templates() -> None:
+    scenario = load_scenario(task_id="hard")
+    full_trace = ReviewTrace(
+        ref_steps={
+            "art-invoice": 1,
+            "EX-POSSIBLE-DUP": 2,
+            "duplicate_check:normalized_invoice_no": 3,
+            "CAND-NORM-01": 3,
+            "art-receipts": 4,
+            "EX-RECEIPT-L1": 5,
+            "EX-RECEIPT-L2": 6,
+            "art-history": 7,
+            "art-vendor": 8,
+            "EX-TAX-001": 9,
+            "art-policy": 10,
+        },
+        steps_used=17,
+    )
+    best_report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.PARTIAL_RECEIPT_PENDING,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=[
+                    "art-invoice",
+                    "art-receipts",
+                    "EX-RECEIPT-L1",
+                    "art-policy",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=12,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+                evidence_refs=[
+                    "art-receipts",
+                    "art-history",
+                    "EX-RECEIPT-L2",
+                    "art-policy",
+                ],
+                route_to=RouteTarget.RECEIVING,
+                saved_at_step=13,
+            ),
+            "L3": LineResolution(
+                resolution_id="LR-L3",
+                line_id="L3",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=[
+                    "art-invoice",
+                    "art-receipts",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=14,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.TAX_AMOUNT_MISMATCH],
+            evidence_refs=[
+                "art-invoice",
+                "art-vendor",
+                "art-policy",
+                "EX-TAX-001",
+            ],
+            route_to=RouteTarget.TAX,
+            saved_at_step=15,
+        ),
+        notes={},
+        trace=full_trace,
+    )
+    blanket_hold = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.PARTIAL_RECEIPT_PENDING],
+                evidence_refs=[
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                    "EX-RECEIPT-L1",
+                    "art-policy",
+                ],
+                route_to=None,
+                saved_at_step=12,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+                evidence_refs=["art-history", "EX-RECEIPT-L2"],
+                route_to=RouteTarget.RECEIVING,
+                saved_at_step=13,
+            ),
+            "L3": LineResolution(
+                resolution_id="LR-L3",
+                line_id="L3",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.POSSIBLE_DUPLICATE_REVIEW],
+                evidence_refs=[
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=14,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.TAX_AMOUNT_MISMATCH],
+            evidence_refs=["art-vendor", "art-policy", "EX-TAX-001"],
+            route_to=RouteTarget.TAX,
+            saved_at_step=15,
+        ),
+        notes={},
+        trace=full_trace,
+    )
+    unsafe_release = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.PARTIAL_RECEIPT_PENDING,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=[
+                    "art-invoice",
+                    "art-receipts",
+                    "EX-RECEIPT-L1",
+                    "art-policy",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=12,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+                evidence_refs=[
+                    "art-receipts",
+                    "art-history",
+                    "EX-RECEIPT-L2",
+                    "art-policy",
+                ],
+                route_to=RouteTarget.RECEIVING,
+                saved_at_step=13,
+            ),
+            "L3": LineResolution(
+                resolution_id="LR-L3",
+                line_id="L3",
+                disposition=Disposition.APPROVE,
+                reason_codes=[
+                    ReasonCode.MATCHED_TO_PO_AND_RECEIPT,
+                    ReasonCode.SAFE_TO_PAY,
+                ],
+                evidence_refs=[
+                    "art-invoice",
+                    "art-receipts",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=14,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.RELEASE_APPROVED_LINES,
+            reason_codes=[ReasonCode.SAFE_TO_PAY],
+            evidence_refs=[
+                "art-po",
+                "art-receipts",
+                "duplicate_check:normalized_invoice_no",
+                "CAND-NORM-01",
+            ],
+            route_to=None,
+            saved_at_step=15,
+        ),
+        notes={},
+        trace=full_trace,
+    )
+    assert best_report.decision_band is DecisionBand.BEST
+    assert blanket_hold.decision_band is DecisionBand.SAFE_SUBOPTIMAL
+    assert unsafe_release.decision_band is DecisionBand.UNSAFE
+    assert best_report.total_score > blanket_hold.total_score > unsafe_release.total_score
+    assert best_report.total_score > 0.95
+    assert 0.55 < blanket_hold.total_score < 0.75
+    assert unsafe_release.total_score <= 0.15
+def test_hard_conservative_partial_evidence_scores_safe_suboptimal() -> None:
+    scenario = load_scenario(task_id="hard")
+    partial_trace = ReviewTrace(
+        ref_steps={
+            "EX-POSSIBLE-DUP": 1,
+            "duplicate_check:normalized_invoice_no": 2,
+            "CAND-NORM-01": 2,
+            "EX-RECEIPT-L1": 3,
+            "EX-RECEIPT-L2": 4,
+            "art-receipts": 5,
+            "art-vendor": 6,
+            "art-policy": 7,
+            "EX-TAX-001": 8,
+        },
+        steps_used=12,
+    )
+    report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.PARTIAL_RECEIPT_PENDING],
+                evidence_refs=[
+                    "art-receipts",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                    "EX-RECEIPT-L1",
+                ],
+                route_to=None,
+                saved_at_step=8,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+                evidence_refs=["art-receipts", "EX-RECEIPT-L2"],
+                route_to=RouteTarget.RECEIVING,
+                saved_at_step=9,
+            ),
+            "L3": LineResolution(
+                resolution_id="LR-L3",
+                line_id="L3",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.POSSIBLE_DUPLICATE_REVIEW],
+                evidence_refs=[
+                    "art-receipts",
+                    "duplicate_check:normalized_invoice_no",
+                    "CAND-NORM-01",
+                ],
+                route_to=None,
+                saved_at_step=10,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.TAX_AMOUNT_MISMATCH],
+            evidence_refs=["EX-TAX-001", "art-vendor", "art-policy"],
+            route_to=RouteTarget.TAX,
+            saved_at_step=11,
+        ),
+        notes={},
+        trace=partial_trace,
+    )
+    assert report.decision_band is DecisionBand.SAFE_SUBOPTIMAL
+    assert 0.55 < report.total_score < 0.80
+def test_hard_shortcut_approvals_without_invoice_and_history_stay_wrong() -> None:
+    scenario = load_scenario(task_id="hard")
+    shortcut_trace = ReviewTrace(
+        ref_steps={
+            "EX-POSSIBLE-DUP": 1,
+            "duplicate_check:normalized_invoice_no": 2,
+            "CAND-NORM-01": 2,
+            "EX-RECEIPT-L1": 3,
+            "EX-RECEIPT-L2": 4,
+            "art-receipts": 5,
+            "art-vendor": 6,
+            "art-policy": 7,
+            "EX-TAX-001": 8,
+        },
+        steps_used=12,
+    )
+    report = grade_case(
+        scenario,
+        line_resolutions={
+            "L1": LineResolution(
+                resolution_id="LR-L1",
+                line_id="L1",
+                disposition=Disposition.APPROVE,
+                reason_codes=[ReasonCode.PARTIAL_RECEIPT_PENDING],
+                evidence_refs=["art-receipts", "EX-RECEIPT-L1"],
+                route_to=None,
+                saved_at_step=8,
+            ),
+            "L2": LineResolution(
+                resolution_id="LR-L2",
+                line_id="L2",
+                disposition=Disposition.HOLD,
+                reason_codes=[ReasonCode.RECEIPT_NOT_CONFIRMED],
+                evidence_refs=["art-receipts", "EX-RECEIPT-L2"],
+                route_to=RouteTarget.RECEIVING,
+                saved_at_step=9,
+            ),
+            "L3": LineResolution(
+                resolution_id="LR-L3",
+                line_id="L3",
+                disposition=Disposition.APPROVE,
+                reason_codes=[ReasonCode.MATCHED_TO_PO_AND_RECEIPT],
+                evidence_refs=["art-receipts"],
+                route_to=None,
+                saved_at_step=10,
+            ),
+        },
+        header_resolution=HeaderResolution(
+            resolution_id="HR-001",
+            payment_recommendation=PaymentRecommendation.HOLD_FULL_INVOICE,
+            reason_codes=[ReasonCode.TAX_AMOUNT_MISMATCH],
+            evidence_refs=["EX-TAX-001", "art-vendor", "art-policy"],
+            route_to=RouteTarget.TAX,
+            saved_at_step=11,
+        ),
+        notes={},
+        trace=shortcut_trace,
+    )
+    assert report.decision_band is DecisionBand.WRONG
+    assert report.core_decision_score < 0.35
+    assert report.total_score < 0.35

tests/test_validation_smoke.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import subprocess
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]
+OPENENV_PROJECT = ROOT.parent / "OpenEnv"
+if not OPENENV_PROJECT.exists():
+    OPENENV_PROJECT = ROOT.parent / "markov" / "OpenEnv"
+def test_openenv_validate_passes() -> None:
+    result = subprocess.run(
+        [
+            "uv",
+            "run",
+            "--project",
+            str(OPENENV_PROJECT),
+            "openenv",
+            "validate",
+            str(ROOT),
+        ],
+        check=False,
+        capture_output=True,
+        text=True,
+    )
+    assert result.returncode == 0, result.stdout + result.stderr
+    assert "[OK]" in result.stdout

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff