"""Reproducible baseline for InvoiceOps."""

from __future__ import annotations

import json
import os
import re
import sys
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Callable, TypeVar

from openai import OpenAI

from invoiceops_env import InvoiceOpsAction, InvoiceOpsEnv
from invoiceops_env.models import (
    ActionType,
    Disposition,
    DuplicateCandidate,
    DuplicateMatchStrategy,
    ExceptionDetail,
    InvoiceOpsObservation,
    NoteType,
    PaymentRecommendation,
    QueueCard,
    ReasonCode,
    RouteTarget,
    TaskId,
)

# Service endpoints and model selection. The os.getenv-backed values can be
# overridden through the environment variable named in each call.
ENV_URL = os.getenv("ENV_URL", "https://ehsaaniqbal-invoiceops-env.hf.space")
DEFAULT_HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1"
API_BASE_URL = os.getenv("API_BASE_URL", DEFAULT_HF_ROUTER_BASE_URL)
MODEL_NAME = os.getenv("MODEL_NAME", "zai-org/GLM-5.1")
# Sampling temperature sent with every completion request.
TEMPERATURE = 0.0
# Completion token budget for a normal attempt.
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "3000"))
# Budget used when the previous attempt ended with finish_reason "length";
# clamped so it is never smaller than MAX_TOKENS.
RETRY_MAX_TOKENS = max(MAX_TOKENS, int(os.getenv("RETRY_MAX_TOKENS", "5000")))
# Completion requests tried per model query before giving up.
MAX_MODEL_ATTEMPTS = 2
BENCHMARK = "invoiceops_env"
# Result files are placed in outputs/evals next to this file.
OUTPUT_DIR = Path(__file__).resolve().parent / "outputs" / "evals"
# Optional run label; when set, its slug replaces the UTC timestamp as run id.
EVAL_RUN_NAME = os.getenv("EVAL_RUN_NAME")
TASKS = [
    TaskId.EASY,
    TaskId.MEDIUM,
    TaskId.MEDIUM_PLUS,
    TaskId.HARD,
]
# Fallback used by _coerce_payment_recommendation when a header payload carries
# a disposition/decision instead of an explicit payment recommendation.
HEADER_DISPOSITION_MAP: dict[Disposition, PaymentRecommendation] = {
    Disposition.APPROVE: PaymentRecommendation.RELEASE_APPROVED_LINES,
    Disposition.HOLD: PaymentRecommendation.HOLD_FULL_INVOICE,
    Disposition.REJECT: PaymentRecommendation.REJECT_FULL_INVOICE,
    Disposition.ESCALATE: PaymentRecommendation.ESCALATE_CASE,
}
# Result type produced by the validator passed to _query_model_json.
ParsedModelOutput = TypeVar("ParsedModelOutput")


def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean switch from the environment.

    Returns ``default`` when ``name`` is unset. Otherwise the value is
    trimmed and lower-cased, and counts as false only when it is one of
    ``0``, ``false``, ``no``, ``off`` or the empty string.
    """
    setting = os.getenv(name)
    if setting is not None:
        disabled_spellings = {"0", "false", "no", "off", ""}
        return setting.strip().lower() not in disabled_spellings
    return default


# Read once at import time from INFERENCE_VERBOSE_STDERR; disabled when unset.
VERBOSE_STDERR = _env_flag("INFERENCE_VERBOSE_STDERR", False)


def strict_task_score(raw_score: float, *, used_fallback: bool) -> float:
    """Return the score to report for a task.

    A run that needed the fallback action is scored ``0.0`` while the
    ``STRICT_BASELINE_SCORING`` flag is enabled (it is on unless the
    environment turns it off). Every other run keeps ``raw_score``.
    """
    if not used_fallback:
        return raw_score
    strict_mode = _env_flag("STRICT_BASELINE_SCORING", True)
    return 0.0 if strict_mode else raw_score


@dataclass
class EpisodeTrace:
    """Running tally of an episode, updated after each environment step."""

    # Reward of every step, in the order the steps were taken.
    rewards: list[float] = field(default_factory=list)
    # Number of environment steps executed so far.
    steps_taken: int = 0


@dataclass
class ObservationMemory:
    """Evidence accumulated across observations (filled by ``update_memory``)."""

    # Opened artifacts keyed by their artifact_id.
    opened_artifacts: dict[str, Any] = field(default_factory=dict)
    # Inspected exception details keyed by their exception_id.
    inspected_exceptions: dict[str, ExceptionDetail] = field(default_factory=dict)
    # Candidates from the latest observation that reported any; an observation
    # with an empty candidate list leaves the previous value in place.
    duplicate_candidates: list[DuplicateCandidate] = field(default_factory=list)


def resolve_api_key() -> tuple[str | None, str | None]:
    """Look up the API key in the ``HF_TOKEN`` environment variable.

    Returns ``(token, "HF_TOKEN")`` when the variable holds a non-empty
    value, otherwise ``(None, None)``.
    """
    token = os.getenv("HF_TOKEN")
    if token:
        return token, "HF_TOKEN"
    return None, None


def _slugify(value: str) -> str:
    """Reduce ``value`` to a file-name friendly slug.

    Each run of characters other than ASCII letters, digits, ``.``, ``_`` and
    ``-`` becomes a single ``-``; leading and trailing ``-``, ``.`` and ``_``
    are then removed. Falls back to ``"run"`` when nothing is left.
    """
    dashed = re.sub(r"[^A-Za-z0-9._-]+", "-", value.strip())
    trimmed = dashed.strip("-._")
    if trimmed:
        return trimmed
    return "run"


def build_output_path(model_name: str) -> tuple[str, Path]:
    """Pick a run id and an unused result-file path for ``model_name``.

    The run id is the slug of ``EVAL_RUN_NAME`` when that is set, otherwise
    the current UTC time formatted as ``YYYYMMDDTHHMMSSZ``. ``OUTPUT_DIR`` is
    created if needed. The file is ``<run_id>__<model_slug>.json``; when that
    already exists, ``__2``, ``__3``, ... is appended until a free name is
    found.

    Returns:
        ``(run_id, path)``.
    """
    if EVAL_RUN_NAME:
        run_id = _slugify(EVAL_RUN_NAME)
    else:
        run_id = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    stem = f"{run_id}__{_slugify(model_name)}"
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    candidate = OUTPUT_DIR / f"{stem}.json"
    counter = 2
    while candidate.exists():
        candidate = OUTPUT_DIR / f"{stem}__{counter}.json"
        counter += 1
    return run_id, candidate


def _sanitize_log_value(value: str | None) -> str:
    """Render an optional message as a single-line log token.

    Carriage returns and line feeds are each replaced by one space so the
    value can never split, or visually overwrite, the one-line log record it
    is embedded in. ``None``, empty and whitespace-only values are reported
    as the literal ``null``.
    """
    if not value:
        return "null"
    # "\r" is flattened as well as "\n": a message with CRLF or bare CR line
    # endings would otherwise leak a control character into the log line.
    flattened = value.replace("\r", " ").replace("\n", " ").strip()
    return flattened or "null"


def format_action_for_log(action: InvoiceOpsAction) -> str:
    """Serialize ``action`` as compact JSON with sorted keys.

    The action is dumped in JSON mode without ``None`` fields, then encoded
    with no whitespace after separators.
    """
    dumped = action.model_dump(mode="json", exclude_none=True)
    return json.dumps(dumped, separators=(",", ":"), sort_keys=True)


def _extract_step_error(
    observation: InvoiceOpsObservation | None,
    *,
    previous_invalid_actions: int,
) -> str | None:
    """Return the observation message when the last step was counted invalid.

    A step is treated as invalid when the observation's invalid-action count
    exceeds ``previous_invalid_actions``. Returns ``None`` when there is no
    observation, when the count did not grow, or when the message is empty.
    """
    if observation is not None:
        count_grew = observation.progress.invalid_actions > previous_invalid_actions
        if count_grew:
            return observation.message or None
    return None


def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record for a task and flush stdout."""
    record = f"[START] task={task} env={env} model={model}"
    print(record, flush=True)


def log_step(
    step: int, action: str, reward: float, done: bool, error: str | None
) -> None:
    """Print the ``[STEP]`` record for one environment step and flush stdout.

    The reward is shown with two decimals, ``done`` as lower-case
    ``true``/``false``, and ``error`` after passing through
    ``_sanitize_log_value``.
    """
    reward_text = f"{reward:.2f}"
    done_text = str(done).lower()
    error_text = _sanitize_log_value(error)
    record = (
        f"[STEP] step={step} action={action} reward={reward_text} "
        f"done={done_text} error={error_text}"
    )
    print(record, flush=True)


def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Print the ``[END]`` record for a task and flush stdout.

    ``score`` is shown with three decimals and ``rewards`` as a
    comma-separated list of two-decimal values (empty when there are none).
    """
    success_text = str(success).lower()
    rewards_text = ",".join([f"{value:.2f}" for value in rewards])
    record = (
        f"[END] success={success_text} steps={steps} "
        f"score={score:.3f} rewards={rewards_text}"
    )
    print(record, flush=True)


def _safe_json_load(text: str) -> dict[str, Any] | None:
    """Best-effort extraction of a JSON object from a model reply.

    ``<think>`` and ``<reasoning>`` blocks are removed, a surrounding
    Markdown code fence is stripped when the text starts with one, and the
    remainder is parsed as JSON. If that fails, the span from the first ``{``
    to the last ``}`` is tried instead.

    Returns:
        The decoded object when it is a ``dict``; ``None`` for blank input,
        undecodable text, or JSON of any other type.
    """
    cleaned = text.strip()
    if not cleaned:
        return None

    # Reasoning blocks may themselves contain braces, so drop them first.
    for tag_pattern in (r"<think>.*?</think>", r"<reasoning>.*?</reasoning>"):
        cleaned = re.sub(tag_pattern, "", cleaned, flags=re.DOTALL | re.IGNORECASE)

    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
        cleaned = re.sub(r"\s*```$", "", cleaned)

    try:
        decoded = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the outermost brace-delimited span inside the prose.
        braced = re.search(r"\{.*\}", cleaned, re.DOTALL)
        if braced is None:
            return None
        try:
            decoded = json.loads(braced.group(0))
        except json.JSONDecodeError:
            return None

    if isinstance(decoded, dict):
        return decoded
    return None


def _normalize_completion_content(raw_content: Any) -> str:
    """Flatten a completion message's ``content`` into plain text.

    ``None`` becomes ``""`` and a string is returned unchanged. For a list,
    the string ``text`` of each part (the ``"text"`` key of a dict, the
    ``text`` attribute of anything else) is collected and the non-empty ones
    are joined with newlines. Any other value is passed through ``str``.
    """
    if raw_content is None:
        return ""
    if isinstance(raw_content, str):
        return raw_content
    if not isinstance(raw_content, list):
        return str(raw_content)

    def text_of(part: Any) -> str | None:
        if isinstance(part, dict):
            candidate = part.get("text")
        else:
            candidate = getattr(part, "text", None)
        return candidate if isinstance(candidate, str) else None

    texts = (text_of(part) for part in raw_content)
    return "\n".join(chunk for chunk in texts if chunk)


def _attempt_trace(
    *,
    completion: Any | None = None,
    content: str = "",
    payload: dict[str, Any] | None = None,
    parsed_ok: bool = False,
    failure_reason: str | None = None,
    error: Exception | None = None,
) -> dict[str, Any]:
    """Build the diagnostic record for a single model attempt.

    The record always holds the content, whether it was blank, whether JSON
    was detected, whether validation passed, and the failure reason. An
    ``error`` adds its class name and message. A ``completion`` adds the
    response id, the first choice's finish reason and raw message (dumped
    when the message offers ``model_dump``, stringified otherwise), and the
    usage block when it offers ``model_dump``.
    """
    record: dict[str, Any] = {
        "content": content,
        "content_empty": not bool(content.strip()),
        "json_detected": payload is not None,
        "validation_passed": parsed_ok,
        "failure_reason": failure_reason,
    }

    if error is not None:
        record["error_type"] = error.__class__.__name__
        record["error_message"] = str(error)

    if completion is not None:
        record["response_id"] = getattr(completion, "id", None)

        choices = getattr(completion, "choices", None) or []
        if choices:
            head = choices[0]
            record["finish_reason"] = getattr(head, "finish_reason", None)
            message = getattr(head, "message", None)
            if message is not None:
                if not hasattr(message, "model_dump"):
                    record["raw_message"] = str(message)
                else:
                    record["raw_message"] = message.model_dump(
                        mode="json", exclude_none=True
                    )

        usage = getattr(completion, "usage", None)
        if usage is not None and hasattr(usage, "model_dump"):
            record["usage"] = usage.model_dump(mode="json", exclude_none=True)

    return record


def _query_model_json(
    openai_client: OpenAI,
    *,
    system_prompt: str,
    user_prompt: str,
    validator: Callable[[dict[str, Any] | None], ParsedModelOutput | None],
    retry_feedback: str,
) -> tuple[ParsedModelOutput | None, list[dict[str, Any]]]:
    """Ask the model for a JSON reply, retrying with corrective feedback.

    Up to ``MAX_MODEL_ATTEMPTS`` chat completions are requested in JSON-object
    mode. Each reply is normalized to text, decoded with ``_safe_json_load``
    and handed to ``validator``; the first non-``None`` validator result ends
    the loop. Every attempt, successful or not, is recorded with
    ``_attempt_trace``.

    Args:
        openai_client: Client used to issue the chat completion requests.
        system_prompt: Content of the initial system message.
        user_prompt: Content of the initial user message.
        validator: Receives the decoded payload (or ``None``) and returns the
            parsed result, or ``None`` to reject the reply.
        retry_feedback: Instruction text embedded in the follow-up message
            sent after a failed attempt.

    Returns:
        ``(parsed, attempts)`` where ``parsed`` is the validator result or
        ``None`` when every attempt failed, and ``attempts`` holds one trace
        dict per attempt.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    attempts: list[dict[str, Any]] = []

    for attempt in range(MAX_MODEL_ATTEMPTS):
        # Only a reply that was cut off by the token limit earns the larger
        # retry budget; request errors record no finish_reason at all.
        expand_token_budget = bool(
            attempts and attempts[-1].get("finish_reason") == "length"
        )
        try:
            completion = openai_client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=TEMPERATURE,
                response_format={"type": "json_object"},
                max_tokens=(RETRY_MAX_TOKENS if expand_token_budget else MAX_TOKENS),
            )
        except Exception as exc:
            # Any client/transport failure is recorded and treated as a
            # failed attempt rather than propagated to the caller.
            attempts.append(
                _attempt_trace(
                    failure_reason="request_error",
                    error=exc,
                )
            )
            if attempt == MAX_MODEL_ATTEMPTS - 1:
                break
            messages.append(
                {
                    "role": "user",
                    "content": (
                        "The previous request failed before a usable response was returned. "
                        f"{retry_feedback} Reply with JSON only and no prose."
                    ),
                }
            )
            continue

        choices = getattr(completion, "choices", None) or []
        if not choices:
            attempts.append(
                _attempt_trace(
                    completion=completion,
                    failure_reason="no_choices",
                )
            )
            if attempt == MAX_MODEL_ATTEMPTS - 1:
                break
            messages.append(
                {
                    "role": "user",
                    "content": (
                        "The previous reply did not contain any choices. "
                        f"{retry_feedback} Reply with JSON only and no prose."
                    ),
                }
            )
            continue

        # Only the first choice is considered.
        message = choices[0].message
        content = _normalize_completion_content(getattr(message, "content", None))
        payload = _safe_json_load(content)
        parsed = validator(payload)
        if parsed is not None:
            attempts.append(
                _attempt_trace(
                    completion=completion,
                    content=content,
                    payload=payload,
                    parsed_ok=True,
                )
            )
            return parsed, attempts

        # Classify the failure from most to least basic cause.
        if not content.strip():
            failure_reason = "empty_content"
        elif payload is None:
            failure_reason = "json_not_found"
        else:
            failure_reason = "schema_validation_failed"

        attempts.append(
            _attempt_trace(
                completion=completion,
                content=content,
                payload=payload,
                parsed_ok=False,
                failure_reason=failure_reason,
            )
        )

        if attempt == MAX_MODEL_ATTEMPTS - 1:
            break

        # Echo the rejected reply back so the model sees what it produced,
        # followed by the corrective instruction.
        messages.extend(
            [
                {"role": "assistant", "content": content or "<empty_response>"},
                {
                    "role": "user",
                    "content": (
                        "Your previous reply could not be used. "
                        f"{retry_feedback} Reply with JSON only and no prose."
                    ),
                },
            ]
        )

    return None, attempts


def _coerce_reason_codes(values: Any) -> list[ReasonCode]:
    """Convert a string or list of strings into distinct ``ReasonCode`` members.

    A single string is treated as a one-element list; any other non-list
    input yields ``[]``. Non-string entries and strings that are not valid
    reason codes are skipped, and first-seen order is kept.
    """
    if isinstance(values, str):
        values = [values]
    if not isinstance(values, list):
        return []

    accepted: list[ReasonCode] = []
    for raw in values:
        if isinstance(raw, str):
            try:
                parsed = ReasonCode(raw)
            except ValueError:
                continue
            if parsed not in accepted:
                accepted.append(parsed)
    return accepted


def _coerce_string_list(values: Any) -> list[str]:
    """Convert a string or list into a list of distinct, trimmed strings.

    A single string is treated as a one-element list; any other non-list
    input yields ``[]``. Non-string entries and blank strings are skipped,
    and first-seen order is kept.
    """
    if isinstance(values, str):
        values = [values]
    if not isinstance(values, list):
        return []

    trimmed = (item.strip() for item in values if isinstance(item, str))
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(ref for ref in trimmed if ref))


def _coerce_action_type(value: Any) -> ActionType | None:
    """Return the ``ActionType`` whose value is ``value``, else ``None``.

    Non-string input and strings that match no member both give ``None``.
    """
    if isinstance(value, str):
        try:
            return ActionType(value)
        except ValueError:
            pass
    return None


def _coerce_match_strategy(value: Any) -> DuplicateMatchStrategy | None:
    """Map a model-supplied strategy name to a ``DuplicateMatchStrategy``.

    The name is trimmed and lower-cased, then looked up in a table of
    accepted aliases. When no alias matches, the name is tried directly as an
    enum value, first exactly as given and then in its trimmed, lower-cased
    form, so stray case or whitespace is tolerated on this path just as it is
    for aliases.

    Returns:
        The matching strategy, or ``None`` for non-string or unknown input.
    """
    if not isinstance(value, str):
        return None
    normalized = value.strip().lower()
    aliases = {
        "exact_invoice_no": DuplicateMatchStrategy.EXACT_INVOICE_NUMBER,
        "exact_invoice_number": DuplicateMatchStrategy.EXACT_INVOICE_NUMBER,
        "invoice_number_exact": DuplicateMatchStrategy.EXACT_INVOICE_NUMBER,
        "normalized_invoice_no": DuplicateMatchStrategy.NORMALIZED_INVOICE_NUMBER,
        "normalized_invoice_number": DuplicateMatchStrategy.NORMALIZED_INVOICE_NUMBER,
        "normalized_invoice": DuplicateMatchStrategy.NORMALIZED_INVOICE_NUMBER,
        "vendor_amount_date": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
        "vendor_amount": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
        "vendor_invoice_amount": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
        "exact_vendor_invoice_amount": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
        "vendor_amount_and_date": DuplicateMatchStrategy.VENDOR_AMOUNT_DATE,
    }
    strategy = aliases.get(normalized)
    if strategy is not None:
        return strategy
    # The raw spelling goes first so anything accepted before still resolves
    # the same way; the normalized spelling is the added, more lenient try.
    for candidate in (value, normalized):
        try:
            return DuplicateMatchStrategy(candidate)
        except ValueError:
            continue
    return None


def _coerce_note_type(value: Any) -> NoteType | None:
    """Return the ``NoteType`` whose value is ``value``, else ``None``.

    Non-string input and strings that match no member both give ``None``.
    """
    if isinstance(value, str):
        try:
            return NoteType(value)
        except ValueError:
            pass
    return None


def _coerce_route(value: Any) -> RouteTarget | None:
    """Return the ``RouteTarget`` whose value is ``value``, else ``None``.

    Non-string input and strings that match no member both give ``None``.
    """
    if isinstance(value, str):
        try:
            return RouteTarget(value)
        except ValueError:
            pass
    return None


def _coerce_disposition(value: Any) -> Disposition | None:
    """Return the ``Disposition`` whose value is ``value``, else ``None``.

    Non-string input and strings that match no member both give ``None``.
    """
    if isinstance(value, str):
        try:
            return Disposition(value)
        except ValueError:
            pass
    return None


def _coerce_payment_recommendation(
    raw_header: dict[str, Any] | str | None,
) -> PaymentRecommendation | None:
    """Derive a ``PaymentRecommendation`` from a header payload.

    A string is interpreted directly as a recommendation value. For a dict,
    the keys ``payment_recommendation``, ``header_recommendation`` and
    ``recommendation`` are tried in that order; if none holds a valid value,
    the ``disposition`` (or ``decision``) entry is translated through
    ``HEADER_DISPOSITION_MAP``.

    Returns:
        The recommendation, or ``None`` when nothing usable is found.
    """
    if isinstance(raw_header, str):
        try:
            return PaymentRecommendation(raw_header)
        except ValueError:
            return None
    if not isinstance(raw_header, dict):
        return None

    explicit_keys = ("payment_recommendation", "header_recommendation", "recommendation")
    for candidate in (raw_header.get(key) for key in explicit_keys):
        if isinstance(candidate, str):
            try:
                return PaymentRecommendation(candidate)
            except ValueError:
                pass

    disposition = _coerce_disposition(
        raw_header.get("disposition") or raw_header.get("decision")
    )
    return None if disposition is None else HEADER_DISPOSITION_MAP.get(disposition)


def _extract_action_payload(payload: dict[str, Any] | None) -> dict[str, Any] | None:
    """Flatten the wrapper shapes a model may use around an action.

    Handled shapes, in order:

    * ``{"action": {...}}``: the nested dict is copied, inheriting a string
      ``action_type`` from the outer level when it has none of its own.
    * ``{"action": "<name>", "args": {...}}`` or the same with
      ``"arguments"``: the argument dict is copied and ``action_type``
      defaults to the action name.
    * anything else: a shallow copy of ``payload`` itself.

    Returns ``None`` only when ``payload`` is ``None``.
    """
    if payload is None:
        return None

    nested = payload.get("action")
    if isinstance(nested, dict):
        flattened = dict(nested)
        outer_type = payload.get("action_type")
        if isinstance(outer_type, str) and "action_type" not in flattened:
            flattened["action_type"] = outer_type
        return flattened

    if isinstance(nested, str):
        for container_key in ("args", "arguments"):
            container = payload.get(container_key)
            if isinstance(container, dict):
                flattened = dict(container)
                flattened.setdefault("action_type", nested)
                return flattened

    return dict(payload)


def _parse_action_payload(payload: dict[str, Any] | None) -> InvoiceOpsAction | None:
    """Turn a decoded model reply into an ``InvoiceOpsAction``.

    The payload is flattened with ``_extract_action_payload``, the action
    type is read from the first usable of ``action_type``, ``action``,
    ``type``, ``kind`` or ``name``, and the fields relevant to that action
    type are collected, accepting a few alternative key spellings for each.

    Returns:
        The constructed action, or ``None`` when the payload is missing, the
        action type or duplicate-check strategy is not recognized, or
        ``InvoiceOpsAction`` refuses the collected fields.
    """
    raw_action = _extract_action_payload(payload)
    if raw_action is None:
        return None

    def lookup(*keys: str) -> Any:
        # Same result as chaining ``raw_action.get(key)`` with ``or``: the
        # first truthy value wins, otherwise the value under the last key.
        found = None
        for key in keys:
            found = raw_action.get(key)
            if found:
                break
        return found

    def justification() -> dict[str, Any]:
        # Reason codes and evidence refs are read the same way for notes and
        # for both kinds of resolution.
        return {
            "reason_codes": _coerce_reason_codes(lookup("reason_codes", "reason_code")),
            "evidence_refs": _coerce_string_list(
                lookup("evidence_refs", "evidence_ref", "refs")
            ),
        }

    def routing() -> RouteTarget | None:
        return _coerce_route(lookup("route_to", "route", "escalation_target"))

    action_type = _coerce_action_type(
        lookup("action_type", "action", "type", "kind", "name")
    )
    if action_type is None:
        return None

    action_kwargs: dict[str, Any] = {"action_type": action_type}

    if action_type is ActionType.OPEN_ARTIFACT:
        action_kwargs["artifact_id"] = lookup("artifact_id", "artifact", "id")
    elif action_type is ActionType.INSPECT_EXCEPTION:
        action_kwargs["exception_id"] = lookup("exception_id", "exception", "id")
    elif action_type is ActionType.RUN_DUPLICATE_CHECK:
        strategy = _coerce_match_strategy(lookup("match_strategy", "strategy"))
        if strategy is None:
            return None
        action_kwargs["match_strategy"] = strategy
    elif action_type is ActionType.ADD_NOTE:
        action_kwargs["note_type"] = _coerce_note_type(lookup("note_type", "note_kind"))
        action_kwargs.update(justification())
        action_kwargs["text"] = raw_action.get("text")
    elif action_type is ActionType.SET_LINE_RESOLUTION:
        action_kwargs["line_id"] = lookup("line_id", "line")
        action_kwargs["disposition"] = _coerce_disposition(
            lookup("disposition", "decision")
        )
        action_kwargs.update(justification())
        action_kwargs["route_to"] = routing()
    elif action_type is ActionType.SET_HEADER_RESOLUTION:
        action_kwargs["payment_recommendation"] = _coerce_payment_recommendation(
            raw_action
        )
        action_kwargs.update(justification())
        action_kwargs["route_to"] = routing()
    elif action_type is ActionType.SUBMIT_CASE:
        action_kwargs["note_ids"] = _coerce_string_list(raw_action.get("note_ids"))
        action_kwargs["line_resolution_ids"] = _coerce_string_list(
            raw_action.get("line_resolution_ids")
        )
        header_resolution_id = raw_action.get("header_resolution_id")
        if isinstance(header_resolution_id, str):
            action_kwargs["header_resolution_id"] = header_resolution_id.strip()

    # Any construction failure means "not a usable action", which lets the
    # caller ask the model again.
    try:
        return InvoiceOpsAction(**action_kwargs)
    except Exception:
        return None


def build_case_snapshot(
    queue_card: QueueCard,
    opened_artifacts: dict[str, Any],
    inspected_exceptions: dict[str, ExceptionDetail],
    duplicate_candidates: list[DuplicateCandidate],
) -> dict[str, Any]:
    """Condense the gathered evidence into a compact, JSON-friendly dict.

    Free text is whitespace-collapsed and truncated with a trailing ``...``,
    and list-like sections are capped (10 fields per artifact, 8 per
    exception, 6 line items, 8 events).

    Returns:
        A dict with the keys ``queue_card``, ``artifacts``, ``exceptions``
        and ``duplicate_candidates``, in that order.

    NOTE(review): ``artifacts`` is keyed by artifact type, so two opened
    artifacts sharing a type would leave only the later one in the snapshot.
    Confirm that types are unique per case.
    """

    def shorten(text: str, *, limit: int = 180) -> str:
        squeezed = re.sub(r"\s+", " ", text.strip())
        if len(squeezed) > limit:
            return f"{squeezed[: limit - 3].rstrip()}..."
        return squeezed

    def summarize_fields(entries: list[Any], *, limit: int = 10) -> dict[str, str]:
        # Entries are capped first, then blank labels/values are dropped.
        summary: dict[str, str] = {}
        for entry in entries[:limit]:
            label, value = entry.label.strip(), entry.value.strip()
            if label and value:
                summary[label] = shorten(value, limit=120)
        return summary

    def summarize_line_items(
        line_items: list[Any], *, limit: int = 6
    ) -> list[dict[str, Any]]:
        rows: list[dict[str, Any]] = []
        for item in line_items[:limit]:
            row: dict[str, Any] = {
                "line_id": item.line_id,
                "description": shorten(item.description, limit=100),
                "amount": item.amount,
            }
            for attr in ("quantity", "unit_price"):
                number = getattr(item, attr)
                if number is not None:
                    row[attr] = number
            for attr, cap in (("status", 60), ("notes", 100)):
                text = getattr(item, attr)
                if text:
                    row[attr] = shorten(text, limit=cap)
            rows.append(row)
        return rows

    def summarize_events(events: list[Any], *, limit: int = 8) -> list[dict[str, Any]]:
        rows: list[dict[str, Any]] = []
        for event in events[:limit]:
            row: dict[str, Any] = {
                "type": event.event_type,
                "date": event.event_date,
                "description": shorten(event.description, limit=120),
            }
            for attr in ("quantity", "amount"):
                number = getattr(event, attr)
                if number is not None:
                    row[attr] = number
            if event.status:
                row["status"] = shorten(event.status, limit=60)
            rows.append(row)
        return rows

    def summarize_artifact(artifact: Any) -> dict[str, Any]:
        view: dict[str, Any] = {"title": artifact.title}
        if artifact.summary:
            view["summary"] = shorten(artifact.summary)
        # Empty sections are left out of the view entirely.
        sections = (
            ("fields", summarize_fields(artifact.fields)),
            ("line_items", summarize_line_items(artifact.line_items)),
            ("events", summarize_events(artifact.events)),
        )
        for section_name, section in sections:
            if section:
                view[section_name] = section
        return view

    def summarize_exception(exception: ExceptionDetail) -> dict[str, Any]:
        view: dict[str, Any] = {
            "type": exception.exception_type.value,
            "severity": exception.severity.value,
            "headline": shorten(exception.headline, limit=120),
        }
        if exception.impacted_line_ids:
            view["impacted_line_ids"] = exception.impacted_line_ids
        if exception.short_description:
            view["summary"] = shorten(exception.short_description, limit=140)
        facts = summarize_fields(exception.fields, limit=8)
        if facts:
            view["facts"] = facts
        if exception.reviewer_guidance:
            view["guidance"] = shorten(exception.reviewer_guidance, limit=160)
        return view

    def summarize_duplicate(candidate: DuplicateCandidate) -> dict[str, Any]:
        return {
            "candidate_id": candidate.candidate_id,
            "invoice_number": candidate.invoice_number,
            "invoice_date": candidate.invoice_date,
            "gross_amount": candidate.gross_amount,
            "status": candidate.status,
            "match_basis": shorten(candidate.match_basis, limit=80),
            "overlap_summary": shorten(candidate.overlap_summary, limit=140),
        }

    card_view = {
        "vendor_name": queue_card.vendor_name,
        "vendor_id": queue_card.vendor_id,
        "invoice_number": queue_card.invoice_number,
        "invoice_date": queue_card.invoice_date,
        "invoice_total": queue_card.invoice_total,
        "currency": queue_card.currency,
        "po_number": queue_card.po_number,
        "risk_flags": [flag.value for flag in queue_card.risk_flags],
        "summary": shorten(queue_card.summary, limit=160),
    }
    artifact_views: dict[str, Any] = {}
    for artifact in opened_artifacts.values():
        artifact_views[artifact.artifact_type.value] = summarize_artifact(artifact)
    exception_views = [
        summarize_exception(detail) for detail in inspected_exceptions.values()
    ]
    duplicate_views = [summarize_duplicate(candidate) for candidate in duplicate_candidates]

    return {
        "queue_card": card_view,
        "artifacts": artifact_views,
        "exceptions": exception_views,
        "duplicate_candidates": duplicate_views,
    }


def update_memory(
    memory: ObservationMemory,
    observation: InvoiceOpsObservation,
) -> None:
    """Fold the evidence carried by ``observation`` into ``memory``.

    An opened artifact is stored under its ``artifact_id`` and an inspected
    exception under its ``exception_id``. A non-empty duplicate-candidate
    list replaces the remembered one; an empty list leaves it untouched.
    """
    artifact = observation.opened_artifact
    if artifact is not None:
        memory.opened_artifacts[artifact.artifact_id] = artifact

    detail = observation.inspected_exception
    if detail is not None:
        memory.inspected_exceptions[detail.exception_id] = detail

    candidates = observation.duplicate_candidates
    if candidates:
        memory.duplicate_candidates = candidates


def build_observation_snapshot(
    observation: InvoiceOpsObservation,
    memory: ObservationMemory,
) -> dict[str, Any]:
    """Combine remembered evidence with the current observation's state.

    Starts from ``build_case_snapshot`` over the queue card and ``memory``,
    then adds the message, progress, known refs, available artifacts,
    visible exceptions, the ids currently in focus, and the draft state
    (line resolutions, header resolution, notes).

    The observation must carry a queue card.
    """
    queue_card = observation.queue_card
    assert queue_card is not None

    snapshot = build_case_snapshot(
        queue_card,
        memory.opened_artifacts,
        memory.inspected_exceptions,
        memory.duplicate_candidates,
    )

    opened = observation.opened_artifact
    inspected = observation.inspected_exception
    header = observation.draft_header_resolution

    snapshot.update(
        {
            "message": observation.message,
            "progress": observation.progress.model_dump(mode="json"),
            "known_refs": observation.known_refs,
            "available_artifacts": [
                entry.model_dump(mode="json")
                for entry in observation.available_artifacts
            ],
            "visible_exceptions": [
                entry.model_dump(mode="json")
                for entry in observation.visible_exceptions
            ],
            "current_focus": {
                "opened_artifact_id": None if opened is None else opened.artifact_id,
                "inspected_exception_id": (
                    None if inspected is None else inspected.exception_id
                ),
            },
            "draft_state": {
                "line_resolutions": [
                    entry.model_dump(mode="json")
                    for entry in observation.draft_line_resolutions
                ],
                "header_resolution": (
                    None if header is None else header.model_dump(mode="json")
                ),
                "notes": [
                    entry.model_dump(mode="json") for entry in observation.draft_notes
                ],
            },
        }
    )
    return snapshot


def build_action_prompt(
    observation: InvoiceOpsObservation,
    memory: ObservationMemory,
) -> str:
    """Compose the user prompt asking the model for the next single action.

    The prompt consists of fixed operating instructions, the allowed values
    of every enum the action schema uses, one JSON template per action type,
    and finally the observation snapshot serialized as indented JSON.
    """
    snapshot = build_observation_snapshot(observation, memory)
    return (
        "You are controlling an AP invoice exception environment one action at a time.\n"
        "Return exactly one JSON object for the single best next action. No prose. No markdown. No multi-action plans.\n"
        "Do not assume you have seen artifacts or exception details that are not in the observation snapshot.\n"
        "Use open_artifact, inspect_exception, and run_duplicate_check to gather evidence before deciding.\n"
        "Only use evidence_refs from known_refs. Invalid refs will be penalized.\n"
        "Only add notes or resolutions when you have enough visible evidence to support them.\n"
        "route_to means the next owner or follow-up queue for the action. Use it whenever another queue must act, including hold actions that still need follow-up.\n"
        "Line resolutions describe content/payment readiness for each line. Header resolution describes whether any payment can be released now.\n"
        "A real case-level blocker can justify hold_full_invoice or escalate_case even when some lines are approved.\n"
        "Submit only when the current draft state is coherent or when no better action remains.\n\n"
        # The allowed values are generated from the enums so the prompt stays
        # in step with the action schema.
        f"Allowed action_type values: {[action.value for action in ActionType]}\n"
        f"Allowed match_strategy values: {[strategy.value for strategy in DuplicateMatchStrategy]}\n"
        f"Allowed disposition values: {[disposition.value for disposition in Disposition]}\n"
        f"Allowed payment_recommendation values: {[recommendation.value for recommendation in PaymentRecommendation]}\n"
        f"Allowed route_to values: {[route.value for route in RouteTarget]}\n"
        f"Allowed note_type values: {[note_type.value for note_type in NoteType]}\n"
        f"Allowed reason_codes values: {[reason.value for reason in ReasonCode]}\n"
        "Action JSON templates (replace angle-bracket placeholders with real values from the observation; omit optional fields when unused):\n"
        '{"action_type":"open_artifact","artifact_id":"<artifact_id>"}\n'
        '{"action_type":"inspect_exception","exception_id":"<exception_id>"}\n'
        '{"action_type":"run_duplicate_check","match_strategy":"normalized_invoice_no"}\n'
        '{"action_type":"set_line_resolution","line_id":"<line_id>","disposition":"<disposition>","reason_codes":["<reason_code>"],"evidence_refs":["<known_ref>"],"route_to":"<optional_route_target>"}\n'
        '{"action_type":"set_header_resolution","payment_recommendation":"<payment_recommendation>","reason_codes":["<reason_code>"],"evidence_refs":["<known_ref>"],"route_to":"<optional_route_target>"}\n'
        '{"action_type":"add_note","note_type":"<note_type>","reason_codes":["<reason_code>"],"evidence_refs":["<known_ref>"],"text":"<brief_handoff_note>"}\n'
        '{"action_type":"submit_case"}\n\n'
        f"Observation snapshot:\n{json.dumps(snapshot, indent=2)}"
    )


def request_action_from_model(
    openai_client: OpenAI,
    *,
    observation: InvoiceOpsObservation,
    memory: ObservationMemory,
) -> tuple[InvoiceOpsAction | None, list[dict[str, Any]]]:
    """Ask the model for the next action given the current observation.

    Delegates to ``_query_model_json`` with the prompt built by
    ``build_action_prompt`` and ``_parse_action_payload`` as the validator.

    Returns:
        ``(action, attempts)`` where ``action`` is ``None`` when no attempt
        produced a usable action, and ``attempts`` holds the per-attempt
        trace dicts.
    """
    return _query_model_json(
        openai_client,
        system_prompt=(
            "You are a deterministic AP invoice reviewer acting in an environment. "
            "Return exactly one valid JSON action and nothing else."
        ),
        user_prompt=build_action_prompt(observation, memory),
        validator=_parse_action_payload,
        # Appended to the corrective message sent after a failed attempt.
        retry_feedback=(
            "Return exactly one action object with action_type and only the fields required for that action. "
            'Examples: {"action_type":"open_artifact","artifact_id":"art-invoice"} '
            'or {"action_type":"submit_case"}. '
            "Do not output a plan or multiple actions."
        ),
    )


def run_task(
    env: Any,
    openai_client: OpenAI,
    task_id: TaskId,
    trace: EpisodeTrace,
) -> dict[str, Any]:
    """Play one episode of ``task_id`` and return its result record.

    The environment is reset, then the model is asked for one action per turn
    until the observation reports ``done``. When no action could be obtained
    for a turn, a ``submit_case`` action is sent instead and the episode is
    flagged as having used the fallback.

    Args:
        env: Environment client exposing ``reset(task_id=...)`` and
            ``step(action)``.
        openai_client: Client forwarded to ``request_action_from_model``.
        task_id: Task to run; its ``value`` is what gets sent to ``env.reset``.
        trace: Mutated in place: after every step ``steps_taken`` is
            incremented and the step reward is appended to ``rewards``.

    Returns:
        The result record for the task. Any exception raised while running the
        episode is caught and reported under ``"error"`` with
        ``failure_reason="task_execution_error"`` and a score of ``0.0``; in
        that case the record still carries whatever queue card, model attempts,
        action history and fallback flag had been collected before the failure.
    """
    # Episode state lives outside the ``try`` so a mid-episode failure can
    # still report everything gathered up to the point of the error, keeping
    # the error record consistent with ``steps_used`` / ``reward_trace``.
    queue_card_payload: dict[str, Any] | None = None
    model_attempts: list[dict[str, Any]] = []
    action_history: list[dict[str, Any]] = []
    used_fallback = False
    decision_parsed = True
    failure_reason: str | None = None

    try:
        reset_result = env.reset(task_id=task_id.value)
        observation = reset_result.observation
        # Snapshot the initial queue card immediately so it is available to
        # both the success and the error record.
        initial_queue_card = observation.queue_card
        if initial_queue_card is not None:
            queue_card_payload = initial_queue_card.model_dump(mode="json")
        memory = ObservationMemory()
        update_memory(memory, observation)

        while not observation.done:
            action, attempts = request_action_from_model(
                openai_client,
                observation=observation,
                memory=memory,
            )
            model_attempts.append(
                {
                    "turn_index": len(model_attempts) + 1,
                    "attempts": attempts,
                }
            )

            if action is None:
                # No usable action came back: submit the case as-is and record
                # why, so the score can be adjusted for the fallback.
                used_fallback = True
                decision_parsed = False
                failure_reason = (
                    attempts[-1]["failure_reason"] if attempts else "no_attempt"
                )
                action = InvoiceOpsAction(action_type=ActionType.SUBMIT_CASE)
                model_attempts[-1]["fallback_action"] = action.model_dump(
                    mode="json",
                    exclude_none=True,
                )

            # Captured before stepping so the step error can be derived by
            # comparing against the post-step observation.
            previous_invalid_actions = observation.progress.invalid_actions
            result = env.step(action)
            reward = float(result.reward or 0.0)
            trace.steps_taken += 1
            trace.rewards.append(reward)
            log_step(
                trace.steps_taken,
                format_action_for_log(action),
                reward,
                bool(result.done),
                _extract_step_error(
                    result.observation,
                    previous_invalid_actions=previous_invalid_actions,
                ),
            )
            action_history.append(
                {
                    "step": trace.steps_taken,
                    "action": action.model_dump(mode="json", exclude_none=True),
                    "reward": reward,
                    "done": bool(result.done),
                    "message": result.observation.message,
                }
            )
            observation = result.observation
            update_memory(memory, observation)

        raw_score = float(observation.episode_score or 0.0)
        score = strict_task_score(raw_score, used_fallback=used_fallback)
        return {
            "task_id": task_id.value,
            "queue_card": queue_card_payload,
            "decision_parsed": decision_parsed,
            "used_fallback": used_fallback,
            "failure_reason": failure_reason,
            "parsed_line_count": len(observation.draft_line_resolutions),
            "parsed_header_resolution": observation.draft_header_resolution is not None,
            "model_attempts": model_attempts,
            "action_history": action_history,
            "raw_score": raw_score,
            "score": score,
            "steps_used": trace.steps_taken,
            "reward_trace": trace.rewards,
            "submission_report": (
                observation.submission_report.model_dump(mode="json")
                if observation.submission_report is not None
                else None
            ),
            "error": None,
        }
    except Exception as exc:
        # Boundary handler: one failing task must not lose the diagnostics
        # already collected, so partial state is reported rather than blanked.
        return {
            "task_id": task_id.value,
            "queue_card": queue_card_payload,
            "decision_parsed": False,
            "used_fallback": used_fallback,
            "failure_reason": "task_execution_error",
            "parsed_line_count": 0,
            "parsed_header_resolution": False,
            "model_attempts": model_attempts,
            "action_history": action_history,
            "raw_score": 0.0,
            "score": 0.0,
            "steps_used": trace.steps_taken,
            "reward_trace": trace.rewards,
            "submission_report": None,
            "error": str(exc),
        }


def main() -> None:
    """Run every task in ``TASKS`` once and write the combined results as JSON.

    For each task a fresh ``EpisodeTrace`` and environment connection are
    created, ``log_start`` / ``log_end`` are emitted around the episode, and
    the task's result record is collected. The aggregate payload (run
    metadata, mean scores and per-task results) is written to the path
    returned by ``build_output_path``. Summary lines go to stderr only when
    ``VERBOSE_STDERR`` is set.

    Raises:
        RuntimeError: If ``resolve_api_key`` yields no API key.
    """
    api_key, api_key_source = resolve_api_key()
    if not api_key:
        raise RuntimeError("Set HF_TOKEN before running inference.py.")

    api_base_url = API_BASE_URL
    openai_client = OpenAI(api_key=api_key, base_url=api_base_url)
    run_id, output_path = build_output_path(MODEL_NAME)

    results: list[dict[str, Any]] = []
    for task_id in TASKS:
        trace = EpisodeTrace()
        log_start(task=task_id.value, env=BENCHMARK, model=MODEL_NAME)

        outcome: dict[str, Any] | None = None
        try:
            with InvoiceOpsEnv(base_url=ENV_URL).sync() as env:
                outcome = run_task(env, openai_client, task_id, trace)
        finally:
            # The end-of-episode line is logged even when the episode blew up
            # before producing a result record.
            if outcome is None:
                final_score = 0.0
                succeeded = False
            else:
                final_score = float(outcome["score"])
                succeeded = outcome.get("error") is None
            log_end(
                success=succeeded,
                steps=trace.steps_taken,
                score=final_score,
                rewards=trace.rewards,
            )

        assert outcome is not None
        results.append(outcome)
        if VERBOSE_STDERR:
            task_score = outcome["score"]
            task_raw_score = outcome.get("raw_score", task_score)
            fallback_text = str(outcome["used_fallback"]).lower()
            sys.stderr.write(
                f"{task_id.value}: score={task_score:.4f} "
                f"raw_score={task_raw_score:.4f} "
                f"fallback={fallback_text}\n"
            )

    task_count = len(results)
    strict_scores = [entry["score"] for entry in results]
    # Records without a raw score fall back to their strict score.
    raw_scores = [entry.get("raw_score", entry["score"]) for entry in results]
    mean_score = sum(strict_scores) / task_count
    raw_mean_score = sum(raw_scores) / task_count

    payload = {
        "run_id": run_id,
        "model_name": MODEL_NAME,
        "env_url": ENV_URL,
        "api_base_url": api_base_url,
        "api_key_source": api_key_source,
        "raw_mean_score": round(raw_mean_score, 4),
        "mean_score": round(mean_score, 4),
        "strict_baseline_scoring": _env_flag("STRICT_BASELINE_SCORING", True),
        "results": results,
    }
    serialized = json.dumps(payload, indent=2)
    output_path.write_text(serialized, encoding="utf-8")

    if not VERBOSE_STDERR:
        return
    sys.stderr.write(
        f"mean_score={mean_score:.4f} raw_mean_score={raw_mean_score:.4f}\n"
    )
    sys.stderr.write(f"wrote={output_path}\n")


# Script entry point: run the baseline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()