Spaces:

Imaginephoenix
/

openenv1

Sleeping

App Files Files Community

Imaginephoenix committed on Apr 7

Commit

496c5c4

verified ·

1 Parent(s): 97c9151

Upload 5 files

Browse files

Files changed (5) hide show

app.py +775 -0
environment.py +469 -0
graders.py +319 -0
inference.py +384 -0
server.py +775 -0

app.py ADDED Viewed

	@@ -0,0 +1,775 @@

+"""Auxiliary server entrypoint required by OpenEnv local validation checks."""
+import os
+from flask import Flask, Response, jsonify, request
+from environment import EmailTriageEnv
+from tasks import get_task_scenario_count, list_task_ids
+FRONTEND_HTML = """<!doctype html>
+<html lang="en">
+<head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Inbox Helper Practice</title>
+    <style>
+        @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
+        :root {
+            --bg: #f5f1e9;
+            --paper: #fffaf2;
+            --ink: #102433;
+            --accent: #ea6a2a;
+            --accent-soft: #ffd6bf;
+            --line: #d7cabb;
+            --ok: #0f7b6c;
+            --warn: #9a3a12;
+            --radius: 14px;
+        }
+        * { box-sizing: border-box; }
+        body {
+            margin: 0;
+            font-family: 'Space Grotesk', sans-serif;
+            color: var(--ink);
+            background:
+                radial-gradient(1100px 460px at -10% -20%, #f2bc9f 0%, transparent 60%),
+                radial-gradient(1100px 520px at 120% 115%, #b8d7cf 0%, transparent 62%),
+                var(--bg);
+            min-height: 100vh;
+        }
+        .wrap {
+            max-width: 1100px;
+            margin: 28px auto;
+            padding: 0 16px;
+            animation: reveal .45s ease-out;
+        }
+        @keyframes reveal {
+            from { opacity: 0; transform: translateY(10px); }
+            to { opacity: 1; transform: translateY(0); }
+        }
+        .title {
+            display: flex;
+            justify-content: space-between;
+            align-items: baseline;
+            gap: 14px;
+            margin-bottom: 14px;
+        }
+        h1 {
+            margin: 0;
+            font-size: clamp(1.5rem, 2vw, 2.2rem);
+            letter-spacing: .4px;
+        }
+        .subtitle {
+            margin: 6px 0 0;
+            font-size: .95rem;
+            opacity: .8;
+        }
+        .badge {
+            background: var(--accent-soft);
+            border: 1px solid #f2b693;
+            color: #7f2e0b;
+            padding: 6px 10px;
+            border-radius: 999px;
+            font-size: .85rem;
+            font-weight: 600;
+        }
+        .grid {
+            display: grid;
+            grid-template-columns: 1fr;
+            gap: 14px;
+        }
+        @media (min-width: 900px) {
+            .grid { grid-template-columns: 1fr 1fr; }
+            .wide { grid-column: span 2; }
+        }
+        .card {
+            background: var(--paper);
+            border: 1px solid var(--line);
+            border-radius: var(--radius);
+            padding: 14px;
+            box-shadow: 0 8px 28px rgba(16, 36, 51, 0.08);
+        }
+        .card h2 {
+            margin: 0 0 10px;
+            font-size: 1rem;
+            text-transform: uppercase;
+            letter-spacing: .08em;
+            opacity: .86;
+        }
+        .row {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 8px;
+            align-items: center;
+            margin-bottom: 10px;
+        }
+        select, input, textarea, button {
+            font-family: inherit;
+            font-size: .95rem;
+        }
+        select, input, textarea {
+            width: 100%;
+            border: 1px solid #cdbba6;
+            border-radius: 10px;
+            padding: 9px 10px;
+            background: #fff;
+            color: var(--ink);
+        }
+        textarea {
+            min-height: 92px;
+            resize: vertical;
+        }
+        button {
+            border: 0;
+            border-radius: 10px;
+            padding: 9px 12px;
+            font-weight: 700;
+            background: var(--ink);
+            color: #fff;
+            cursor: pointer;
+            transition: transform .12s ease, opacity .12s ease;
+        }
+        button.secondary {
+            background: #285066;
+        }
+        button.accent {
+            background: var(--accent);
+        }
+        button:hover { transform: translateY(-1px); }
+        button:active { transform: translateY(0); opacity: .92; }
+        .status {
+            padding: 8px 10px;
+            border-radius: 10px;
+            background: #eef7f5;
+            border: 1px solid #c7e4de;
+            color: var(--ok);
+            font-weight: 600;
+            min-height: 40px;
+            display: flex;
+            align-items: center;
+        }
+        .status.error {
+            background: #fff1ea;
+            border-color: #ffc8ae;
+            color: var(--warn);
+        }
+        pre {
+            margin: 0;
+            white-space: pre-wrap;
+            background: #0f1b24;
+            color: #d9efe9;
+            border-radius: 10px;
+            padding: 12px;
+            max-height: 340px;
+            overflow: auto;
+            font-family: 'IBM Plex Mono', monospace;
+            font-size: .85rem;
+            border: 1px solid #21313f;
+        }
+        .email-block {
+            background: #fff;
+            border: 1px solid #d9ccbc;
+            border-radius: 10px;
+            padding: 12px;
+        }
+        .email-row {
+            margin-bottom: 8px;
+            font-size: .95rem;
+            line-height: 1.35;
+        }
+        .email-row strong {
+            display: inline-block;
+            min-width: 66px;
+        }
+        .help {
+            margin: 0 0 10px;
+            font-size: .9rem;
+            opacity: .8;
+        }
+        .metric {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            margin-bottom: 8px;
+            padding-bottom: 6px;
+            border-bottom: 1px dashed #dbcfbe;
+            font-size: .95rem;
+        }
+        .metric strong {
+            font-weight: 700;
+        }
+        .coach {
+            background: #fff7ed;
+            border: 1px solid #f2caa9;
+            border-radius: 10px;
+            padding: 10px;
+            min-height: 74px;
+            line-height: 1.4;
+            font-size: .92rem;
+        }
+        .chip-row {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 8px;
+            margin-top: 10px;
+        }
+        .chip {
+            background: #eaf3ff;
+            border: 1px solid #b9d1ef;
+            color: #184469;
+            border-radius: 999px;
+            padding: 6px 10px;
+            font-size: .84rem;
+            cursor: pointer;
+            font-weight: 600;
+        }
+    </style>
+</head>
+<body>
+    <div class="wrap">
+        <div class="title">
+            <div>
+                <h1>Inbox Helper Practice</h1>
+                <p class="subtitle">Practice deciding priority, category, and who should handle each email.</p>
+            </div>
+            <span class="badge" id="badge">connecting...</span>
+        </div>
+        <div class="grid">
+            <section class="card">
+                <h2>Start a Scenario</h2>
+                <p class="help">Pick a difficulty, then click Start.</p>
+                <div class="row">
+                    <select id="taskId">
+                        <option value="task_easy">Easy: one clear email</option>
+                        <option value="task_medium">Medium: mixed inbox</option>
+                        <option value="task_hard">Hard: high-risk complaint</option>
+                        <option value="task_production">Production: full inbox simulator</option>
+                    </select>
+                </div>
+                <div id="productionControls" style="display:none;">
+                    <div class="row">
+                        <select id="productionProfile">
+                            <option value="light">Workload: Light</option>
+                            <option value="standard" selected>Workload: Standard</option>
+                            <option value="heavy">Workload: Heavy</option>
+                        </select>
+                    </div>
+                    <div class="row">
+                        <select id="businessHoursMode">
+                            <option value="false" selected>Time Profile: 24x7 inbox</option>
+                            <option value="true">Time Profile: business hours focus</option>
+                        </select>
+                    </div>
+                    <div class="row">
+                        <select id="escalationMode">
+                            <option value="low">Escalation: Low</option>
+                            <option value="normal" selected>Escalation: Normal</option>
+                            <option value="high">Escalation: High</option>
+                        </select>
+                    </div>
+                </div>
+                <div class="row">
+                    <button class="accent" id="btnReset">Start</button>
+                    <button class="secondary" id="btnState">Check Progress</button>
+                </div>
+                <div class="status" id="status">Ready. Start a scenario.</div>
+            </section>
+            <section class="card">
+                <h2>Your Decision</h2>
+                <p class="help">Choose priority, who should handle it, and a short reason.</p>
+                <div class="row">
+                    <select id="label">
+                        <option value="urgent">Urgent</option>
+                        <option value="normal" selected>Normal</option>
+                        <option value="spam">Spam</option>
+                        <option value="archive">Archive</option>
+                    </select>
+                </div>
+                <div class="row">
+                    <input id="routeTo" placeholder="Who should handle this? (billing, safety, engineering, support)" value="general" />
+                </div>
+                <div class="row">
+                    <textarea id="summary" placeholder="Write one clear sentence with key clues from the email.">Needs review.</textarea>
+                </div>
+                <div class="row">
+                    <button id="btnStep">Send Decision</button>
+                </div>
+            </section>
+            <section class="card wide">
+                <h2>Current Email</h2>
+                <div class="email-block">
+                    <div class="email-row"><strong>Subject:</strong> <span id="mailSubject">No email loaded yet.</span></div>
+                    <div class="email-row"><strong>From:</strong> <span id="mailSender">-</span></div>
+                    <div class="email-row"><strong>Message:</strong> <span id="mailBody">Start a scenario to load an email.</span></div>
+                </div>
+            </section>
+            <section class="card">
+                <h2>Live Progress</h2>
+                <div class="metric"><span>Task</span><strong id="insightTask">-</strong></div>
+                <div class="metric"><span>Scenario</span><strong id="insightScenario">-</strong></div>
+                <div class="metric"><span>Progress</span><strong id="insightProgress">0/0</strong></div>
+                <div class="metric"><span>Last Reward</span><strong id="insightReward">-</strong></div>
+                <div class="metric"><span>Base Score</span><strong id="insightBase">-</strong></div>
+            </section>
+            <section class="card">
+                <h2>Coach Notes</h2>
+                <p class="help">Use this to improve your next triage action.</p>
+                <div class="coach" id="coachNotes">Start a scenario and submit one decision to get feedback.</div>
+                <div class="chip-row">
+                    <button class="chip" id="chipSafety">Quick Fill: Urgent + Safety</button>
+                    <button class="chip" id="chipBilling">Quick Fill: Normal + Billing</button>
+                    <button class="chip" id="chipSpam">Quick Fill: Spam + General</button>
+                </div>
+            </section>
+            <section class="card wide">
+                <h2>Details (Advanced)</h2>
+                <pre id="output">Waiting for your first action...</pre>
+            </section>
+        </div>
+    </div>
+    <script>
+        const statusEl = document.getElementById('status');
+        const badgeEl = document.getElementById('badge');
+        const outEl = document.getElementById('output');
+        const mailSubjectEl = document.getElementById('mailSubject');
+        const mailSenderEl = document.getElementById('mailSender');
+        const mailBodyEl = document.getElementById('mailBody');
+        const taskIdEl = document.getElementById('taskId');
+        const productionControlsEl = document.getElementById('productionControls');
+        const insightTaskEl = document.getElementById('insightTask');
+        const insightScenarioEl = document.getElementById('insightScenario');
+        const insightProgressEl = document.getElementById('insightProgress');
+        const insightRewardEl = document.getElementById('insightReward');
+        const insightBaseEl = document.getElementById('insightBase');
+        const coachNotesEl = document.getElementById('coachNotes');
+        function setStatus(msg, isError = false) {
+            statusEl.textContent = msg;
+            statusEl.classList.toggle('error', isError);
+        }
+        function writeOutput(value) {
+            outEl.textContent = typeof value === 'string' ? value : JSON.stringify(value, null, 2);
+        }
+        function updateEmailPanel(data) {
+            if (!data || !data.observation) {
+                return;
+            }
+            const obs = data.observation;
+            mailSubjectEl.textContent = obs.subject || 'No subject';
+            mailSenderEl.textContent = obs.sender || '-';
+            mailBodyEl.textContent = obs.body || '';
+        }
+        function updateProductionControlsVisibility() {
+            const isProduction = taskIdEl.value === 'task_production';
+            productionControlsEl.style.display = isProduction ? 'block' : 'none';
+        }
+        function safeNumber(value) {
+            return typeof value === 'number' && !Number.isNaN(value) ? value : null;
+        }
+        function updateInsights(data) {
+            const info = (data && data.info) ? data.info : {};
+            const taskValue = info.task_id || data.task_id || (data.observation && data.observation.task_id) || '-';
+            const scenarioValue = info.scenario_id || '-';
+            insightTaskEl.textContent = taskValue;
+            insightScenarioEl.textContent = scenarioValue;
+            const emailsProcessed = safeNumber(info.emails_processed);
+            const emailsTotal = safeNumber(info.emails_total);
+            if (emailsProcessed !== null && emailsTotal !== null) {
+                insightProgressEl.textContent = `${emailsProcessed}/${emailsTotal}`;
+            } else if (safeNumber(data.current_step) !== null && safeNumber(data.total_steps) !== null) {
+                insightProgressEl.textContent = `${data.current_step}/${data.total_steps}`;
+            }
+            const rewardValue = safeNumber(data.reward);
+            insightRewardEl.textContent = rewardValue !== null ? rewardValue.toFixed(6) : '-';
+            const baseScoreValue = safeNumber(info.base_score);
+            insightBaseEl.textContent = baseScoreValue !== null ? baseScoreValue.toFixed(6) : '-';
+            const tips = [];
+            if (info.validation_error) {
+                tips.push('Action format is invalid. Keep label/summary/route_to filled correctly.');
+            }
+            const routeNoise = safeNumber(info.grade_route_noise_penalty);
+            if (routeNoise !== null && routeNoise > 0.01) {
+                tips.push('Route to one best owner team. Avoid sending to many teams at once.');
+            }
+            const summaryMatch = safeNumber(info.grade_summary_match);
+            if (summaryMatch !== null && summaryMatch < 0.6) {
+                tips.push('Summary is weak. Include concrete clues from subject/body/thread.');
+            }
+            const labelMatch = safeNumber(info.grade_label_match);
+            if (labelMatch !== null && labelMatch < 1.0) {
+                tips.push('Priority label may be off. Re-check urgency and risk signals.');
+            }
+            const routeMatch = safeNumber(info.grade_route_match);
+            if (routeMatch !== null && routeMatch < 1.0) {
+                tips.push('Routing looks off. Pick the team that directly owns this issue.');
+            }
+            const urgencyComponent = safeNumber(info.grade_urgency_component);
+            if (urgencyComponent !== null && urgencyComponent < 0.2) {
+                tips.push('For high-risk complaints, mark urgent and route to safety first.');
+            }
+            if (!tips.length && typeof info.grading_feedback === 'string' && info.grading_feedback) {
+                tips.push(info.grading_feedback);
+            }
+            coachNotesEl.textContent = tips.length
+                ? tips.join(' ')
+                : 'Looks good. Keep your next route precise and your summary evidence-based.';
+        }
+        function prefillAction(label, routeTo, summary) {
+            document.getElementById('label').value = label;
+            document.getElementById('routeTo').value = routeTo;
+            document.getElementById('summary').value = summary;
+        }
+        async function postJson(path, payload) {
+            const response = await fetch(path, {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify(payload || {}),
+            });
+            const text = await response.text();
+            let data = text;
+            try { data = JSON.parse(text); } catch (e) {}
+            if (!response.ok) {
+                throw new Error('HTTP ' + response.status + ' - ' + text);
+            }
+            return data;
+        }
+        async function warmup() {
+            try {
+                const res = await fetch('/meta');
+                const data = await res.json();
+                badgeEl.textContent = data.status === 'ok' ? 'ready' : 'check service';
+            } catch (e) {
+                badgeEl.textContent = 'offline';
+            }
+        }
+        document.getElementById('btnReset').addEventListener('click', async () => {
+            const taskId = taskIdEl.value;
+            setStatus('Starting a new scenario...');
+            try {
+                const payload = { task_id: taskId };
+                if (taskId === 'task_production') {
+                    payload.production_profile = document.getElementById('productionProfile').value;
+                    payload.business_hours_mode = document.getElementById('businessHoursMode').value === 'true';
+                    payload.escalation_mode = document.getElementById('escalationMode').value;
+                }
+                const data = await postJson('/reset', payload);
+                setStatus('Scenario started. Read the email below.');
+                updateEmailPanel(data);
+                updateInsights(data);
+                writeOutput(data);
+            } catch (e) {
+                setStatus('Could not start scenario. See details below.', true);
+                writeOutput(String(e));
+            }
+        });
+        document.getElementById('btnState').addEventListener('click', async () => {
+            setStatus('Checking progress...');
+            try {
+                const data = await postJson('/state', {});
+                setStatus('Progress updated.');
+                updateInsights(data);
+                writeOutput(data);
+            } catch (e) {
+                setStatus('Could not fetch progress. See details below.', true);
+                writeOutput(String(e));
+            }
+        });
+        document.getElementById('btnStep').addEventListener('click', async () => {
+            const payload = {
+                label: document.getElementById('label').value,
+                summary: document.getElementById('summary').value,
+                route_to: document.getElementById('routeTo').value,
+            };
+            setStatus('Sending your decision...');
+            try {
+                const data = await postJson('/step', payload);
+                setStatus('Decision saved.');
+                updateEmailPanel(data);
+                updateInsights(data);
+                writeOutput(data);
+            } catch (e) {
+                setStatus('Could not submit decision. See details below.', true);
+                writeOutput(String(e));
+            }
+        });
+        document.getElementById('chipSafety').addEventListener('click', () => {
+            prefillAction('urgent', 'safety', 'Potential safety risk with immediate escalation needed.');
+        });
+        document.getElementById('chipBilling').addEventListener('click', () => {
+            prefillAction('normal', 'billing', 'Customer billing issue needs finance team review and response.');
+        });
+        document.getElementById('chipSpam').addEventListener('click', () => {
+            prefillAction('spam', 'general', 'Likely phishing or irrelevant message with suspicious external request.');
+        });
+        taskIdEl.addEventListener('change', updateProductionControlsVisibility);
+        updateProductionControlsVisibility();
+        warmup();
+    </script>
+</body>
+</html>
+"""
+# Flask application object; the route decorators below register against it.
+app = Flask(__name__)
+# Single module-level environment shared by every request. /reset replaces it;
+# /step and /state operate on whatever instance is currently bound here.
+current_env = EmailTriageEnv(task_id="task_easy")
+# Next scenario index to serve for each task id. /reset reads the value and
+# advances it modulo the task's scenario count (round-robin).
+SCENARIO_COUNTERS = {task_id: 0 for task_id in list_task_ids()}
+# Scenario split used for resets; taken from OPENENV_EVAL_SPLIT, default "public".
+DEFAULT_EVAL_SPLIT = os.getenv("OPENENV_EVAL_SPLIT", "public")
+# True only when OPENENV_ALLOW_CLIENT_EVAL_OVERRIDE is "true" (case-insensitive,
+# surrounding whitespace ignored). When False, /reset rejects requests that
+# carry eval_split or scenario_index.
+ALLOW_CLIENT_EVAL_OVERRIDE = (
+    os.getenv("OPENENV_ALLOW_CLIENT_EVAL_OVERRIDE", "false").strip().lower() == "true"
+)
+@app.get("/")
+def root_page():
+    """Render a lightweight frontend for interacting with the environment."""
+    return Response(FRONTEND_HTML, mimetype="text/html")
+@app.get("/meta")
+def root_endpoint():
+    """Return service metadata for health checks and machine clients."""
+    return jsonify(
+        {
+            "name": "email-triage-env",
+            "status": "ok",
+            "endpoints": {
+                "reset": {"method": "POST", "path": "/reset"},
+                "step": {"method": "POST", "path": "/step"},
+                "state": {"method": "POST", "path": "/state"},
+            },
+            "scenario_pools": {
+                "public": {
+                    task_id: get_task_scenario_count(task_id, "public")
+                    for task_id in list_task_ids()
+                },
+            },
+            "eval_split": DEFAULT_EVAL_SPLIT,
+            "production_runtime_controls": {
+                "production_profile": ["light", "standard", "heavy"],
+                "business_hours_mode": [True, False],
+                "escalation_mode": ["low", "normal", "high"],
+            },
+        }
+    )
+@app.post("/reset")
+def reset_endpoint():
+    """Reset the environment with a selected task and return ResetResult JSON.
+    Returns:
+        Flask response containing reset payload.
+    """
+    global current_env
+    global SCENARIO_COUNTERS
+    payload = request.get_json(silent=True)
+    if payload is None:
+        payload = {}
+    elif not isinstance(payload, dict):
+        return jsonify({"error": "Malformed JSON payload."}), 400
+    task_id = payload.get("task_id", "task_easy")
+    if not isinstance(task_id, str):
+        return jsonify({"error": "Field 'task_id' must be a string."}), 400
+    runtime_options: dict[str, object] = {}
+    if task_id == "task_production":
+        production_profile = payload.get("production_profile", "standard")
+        if not isinstance(production_profile, str) or production_profile not in {
+            "light",
+            "standard",
+            "heavy",
+        }:
+            return (
+                jsonify(
+                    {
+                        "error": (
+                            "Field 'production_profile' must be one of "
+                            "light/standard/heavy."
+                        )
+                    }
+                ),
+                400,
+            )
+        escalation_mode = payload.get("escalation_mode", "normal")
+        if not isinstance(escalation_mode, str) or escalation_mode not in {
+            "low",
+            "normal",
+            "high",
+        }:
+            return (
+                jsonify(
+                    {
+                        "error": (
+                            "Field 'escalation_mode' must be one of "
+                            "low/normal/high."
+                        )
+                    }
+                ),
+                400,
+            )
+        business_hours_mode = payload.get("business_hours_mode", False)
+        if isinstance(business_hours_mode, str):
+            business_hours_mode = business_hours_mode.strip().lower() in {
+                "1",
+                "true",
+                "yes",
+                "on",
+            }
+        elif not isinstance(business_hours_mode, bool):
+            return jsonify({"error": "Field 'business_hours_mode' must be boolean."}), 400
+        runtime_options = {
+            "production_profile": production_profile,
+            "business_hours_mode": business_hours_mode,
+            "escalation_mode": escalation_mode,
+        }
+    if not ALLOW_CLIENT_EVAL_OVERRIDE and (
+        "eval_split" in payload or "scenario_index" in payload
+    ):
+        return jsonify(
+            {
+                "error": (
+                    "Client overrides for eval_split/scenario_index are disabled "
+                    "by server policy."
+                )
+            }
+        ), 400
+    eval_split = DEFAULT_EVAL_SPLIT
+    if ALLOW_CLIENT_EVAL_OVERRIDE:
+        requested_split = payload.get("eval_split", DEFAULT_EVAL_SPLIT)
+        if not isinstance(requested_split, str):
+            return jsonify({"error": "Field 'eval_split' must be a string."}), 400
+        eval_split = requested_split
+    requested_index = payload.get("scenario_index") if ALLOW_CLIENT_EVAL_OVERRIDE else None
+    if requested_index is not None and (not isinstance(requested_index, int) or requested_index < 0):
+        return jsonify({"error": "Field 'scenario_index' must be a non-negative integer."}), 400
+    try:
+        scenario_count = get_task_scenario_count(task_id, eval_split)
+        if requested_index is None:
+            scenario_index = SCENARIO_COUNTERS.get(task_id, 0)
+            if scenario_count > 0:
+                SCENARIO_COUNTERS[task_id] = (scenario_index + 1) % scenario_count
+        else:
+            scenario_index = requested_index
+        current_env = EmailTriageEnv(
+            task_id=task_id,
+            scenario_index=scenario_index,
+            split=eval_split,
+            runtime_options=runtime_options,
+        )
+        reset_result = current_env.reset()
+    except KeyError as error:
+        return jsonify({"error": str(error)}), 400
+    return jsonify(reset_result.model_dump())
+@app.post("/step")
+def step_endpoint():
+    """Advance environment by one action and return StepResult JSON.
+    Returns:
+        Flask response containing step payload.
+    """
+    payload = request.get_json(silent=True)
+    if payload is None:
+        return jsonify({"error": "Malformed JSON payload."}), 400
+    step_result = current_env.step(payload)
+    return jsonify(step_result.model_dump())
+@app.post("/state")
+def state_endpoint():
+    """Return read-only EnvironmentState JSON snapshot.
+    Returns:
+        Flask response containing state payload.
+    """
+    state_result = current_env.state()
+    return jsonify(state_result.model_dump())
+def main() -> None:
+    """Run the Flask app for local and script-based launches."""
+    app.run(host="0.0.0.0", port=7860)
+if __name__ == "__main__":
+    main()

environment.py ADDED Viewed

	@@ -0,0 +1,469 @@

+"""Core OpenEnv email triage environment implementation."""
+import os
+from typing import cast
+from pydantic import ValidationError
+from graders import SCORE_EPSILON, grade_easy, grade_hard, grade_medium_step
+from models import (
+    EmailObservation,
+    EnvironmentState,
+    ResetResult,
+    RewardResult,
+    StepResult,
+    TriageAction,
+)
+from tasks import get_task_definition
+class EmailTriageEnv:
+    """Deterministic email triage environment implementing reset, step, and state."""
+    def __init__(
+        self,
+        task_id: str,
+        scenario_index: int = 0,
+        split: str | None = None,
+        runtime_options: dict[str, object] | None = None,
+    ) -> None:
+        """Initialize environment with a selected task.
+        Args:
+            task_id: Task identifier such as task_easy, task_medium, or task_hard.
+            scenario_index: Deterministic scenario index within the task pool.
+            split: Scenario split, either public or private_eval.
+            runtime_options: Optional deterministic runtime controls for task generation.
+        """
+        self.task_id = task_id
+        self._episode_index = max(0, scenario_index)
+        self.split = split or os.getenv("OPENENV_EVAL_SPLIT", "public")
+        self.runtime_options = runtime_options or {}
+        self._task_definition = get_task_definition(
+            task_id,
+            self._episode_index,
+            self.split,
+            self.runtime_options,
+        )
+        self._scenario_id = str(self._task_definition.get("scenario_id", "unknown"))
+        self._emails = cast(list[dict[str, object]], self._task_definition.get("emails", []))
+        self._ground_truth = cast(
+            list[dict[str, object]], self._task_definition.get("ground_truth", [])
+        )
+        self._current_index = 0
+        self._current_step = 0
+        self._done = False
+        self._max_steps = max(10, len(self._emails) + 5)
+        self._action_history: list[TriageAction] = []
+        self._reward_history: list[float] = []
+        self._base_score_history: list[float] = []
+        self._generated_followups = 0
+        self._max_generated_followups = 4
+        self._followup_quality_threshold = 0.7
+        self._configure_runtime_controls()
+    def reset(self) -> ResetResult:
+        """Reset episode state and return the first observation.
+        Returns:
+            ResetResult containing first observation and metadata.
+        """
+        self._task_definition = get_task_definition(
+            self.task_id,
+            self._episode_index,
+            self.split,
+            self.runtime_options,
+        )
+        self._scenario_id = str(self._task_definition.get("scenario_id", "unknown"))
+        self._emails = cast(list[dict[str, object]], self._task_definition.get("emails", []))
+        self._ground_truth = cast(
+            list[dict[str, object]], self._task_definition.get("ground_truth", [])
+        )
+        self._current_index = 0
+        self._current_step = 0
+        self._done = False
+        self._max_steps = max(10, len(self._emails) + 5)
+        self._action_history = []
+        self._reward_history = []
+        self._base_score_history = []
+        self._generated_followups = 0
+        self._configure_runtime_controls()
+        self._episode_index += 1
+        first_observation = self._build_observation(self._current_index)
+        return ResetResult(
+            observation=first_observation,
+            info={
+                "task_id": self.task_id,
+                "scenario_id": self._scenario_id,
+                "split": self.split,
+                "step": self._current_step,
+                "emails_total": len(self._emails),
+                "task_description": str(self._task_definition.get("description", "")),
+            },
+        )
+    def step(self, action: TriageAction) -> StepResult:
+        """Apply an action and return StepResult.
+        Args:
+            action: Proposed triage action.
+        Returns:
+            StepResult with next observation, reward, done flag, and metadata.
+        """
+        if self._done:
+            return StepResult(
+                observation=self._terminal_observation(),
+                reward=SCORE_EPSILON,
+                done=True,
+                info={
+                    "task_id": self.task_id,
+                    "scenario_id": self._scenario_id,
+                    "split": self.split,
+                    "step": self._current_step,
+                    "already_done": True,
+                },
+            )
+        try:
+            validated_action = TriageAction.model_validate(action)
+        except ValidationError as validation_error:
+            self._current_step += 1
+            self._reward_history.append(SCORE_EPSILON)
+            self._done = self._current_step >= self._max_steps
+            return StepResult(
+                observation=self._build_observation(self._current_index),
+                reward=SCORE_EPSILON,
+                done=self._done,
+                info={
+                    "task_id": self.task_id,
+                    "scenario_id": self._scenario_id,
+                    "split": self.split,
+                    "step": self._current_step,
+                    "emails_total": len(self._emails),
+                    "emails_processed": self._current_index,
+                    "emails_remaining": max(len(self._emails) - self._current_index, 0),
+                    "validation_error": str(validation_error),
+                },
+            )
+        base_result = self._grade_current_step(validated_action)
+        base_score = base_result.score
+        previous_base_score = self._base_score_history[-1] if self._base_score_history else None
+        progress_signal = self._compute_progress_signal(base_score, previous_base_score)
+        truth_for_step = (
+            self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
+            if self._ground_truth
+            else {}
+        )
+        self._maybe_enqueue_follow_up(validated_action, truth_for_step, base_score)
+        self._action_history.append(validated_action)
+        self._base_score_history.append(base_score)
+        self._current_step += 1
+        penalties = self._compute_penalties(validated_action)
+        trajectory_bonus = self._compute_trajectory_bonus()
+        step_cost = self._compute_step_cost()
+        final_reward = self._clip_reward(
+            base_score + progress_signal + trajectory_bonus - penalties - step_cost
+        )
+        self._reward_history.append(final_reward)
+        if self._current_index < len(self._emails):
+            self._current_index += 1
+        all_emails_processed = self._current_index >= len(self._emails)
+        self._done = all_emails_processed or self._current_step >= self._max_steps
+        next_observation = (
+            self._terminal_observation()
+            if self._done
+            else self._build_observation(self._current_index)
+        )
+        info = {
+            "task_id": self.task_id,
+            "scenario_id": self._scenario_id,
+            "split": self.split,
+            "step": self._current_step,
+            "emails_total": len(self._emails),
+            "emails_processed": min(self._current_index, len(self._emails)),
+            "emails_remaining": max(len(self._emails) - self._current_index, 0),
+            "base_score": float(base_score),
+            "progress_signal": float(progress_signal),
+            "step_cost": float(step_cost),
+            "penalties": float(penalties),
+            "trajectory_bonus": float(trajectory_bonus),
+            "grading_feedback": base_result.feedback,
+        }
+        for breakdown_key, breakdown_value in base_result.breakdown.items():
+            if isinstance(breakdown_value, (int, float)):
+                info[f"grade_{breakdown_key}"] = float(breakdown_value)
+        return StepResult(
+            observation=next_observation,
+            reward=final_reward,
+            done=self._done,
+            info=info,
+        )
+    def _maybe_enqueue_follow_up(
+        self,
+        action: TriageAction,
+        truth: dict[str, object],
+        base_score: float,
+    ) -> None:
+        """Insert deterministic escalation follow-up emails for production mode."""
+        if self.task_id != "task_production":
+            return
+        if self._generated_followups >= self._max_generated_followups:
+            return
+        if not self._emails:
+            return
+        expected_label = str(truth.get("label", ""))
+        expected_route = str(truth.get("route_to", "general"))
+        is_missed_critical = (
+            expected_label == "urgent"
+            and (action.label != "urgent" or expected_route not in action.route_to.lower())
+        )
+        if not is_missed_critical and base_score >= self._followup_quality_threshold:
+            return
+        source_email = self._emails[min(self._current_index, len(self._emails) - 1)]
+        source_subject = str(source_email.get("subject", "Inbox incident"))
+        source_timestamp = str(source_email.get("timestamp", "2026-04-03T00:00:00Z"))
+        followup_email = {
+            "email_id": f"followup-{self._scenario_id}-{self._generated_followups + 1}",
+            "subject": f"Escalation follow-up: {source_subject}",
+            "body": (
+                "Automated escalation triggered because prior triage appears incomplete. "
+                "Please route to the responsible team and provide a clear summary now."
+            ),
+            "sender": "incident-control@acme-enterprise.com",
+            "timestamp": source_timestamp,
+            "thread_history": [f"Previous message subject: {source_subject}"],
+        }
+        followup_truth = {
+            "label": "urgent",
+            "route_to": expected_route,
+            "priority_weight": min(max(float(truth.get("priority_weight", 1.5)) + 0.2, 1.5), 2.0),
+            "summary_keywords": ["escalation", "follow-up", expected_route],
+        }
+        insert_at = min(self._current_index + 1, len(self._emails))
+        self._emails.insert(insert_at, followup_email)
+        self._ground_truth.insert(insert_at, followup_truth)
+        self._generated_followups += 1
+    def _configure_runtime_controls(self) -> None:
+        """Apply deterministic runtime control options for production simulator."""
+        if self.task_id != "task_production":
+            self._max_generated_followups = 4
+            self._followup_quality_threshold = 0.7
+            return
+        escalation_mode = str(self.runtime_options.get("escalation_mode", "normal")).lower()
+        escalation_map = {
+            "low": (2, 0.55),
+            "normal": (4, 0.7),
+            "high": (8, 0.85),
+        }
+        max_followups, threshold = escalation_map.get(escalation_mode, escalation_map["normal"])
+        self._max_generated_followups = max_followups
+        self._followup_quality_threshold = threshold
+    def state(self) -> EnvironmentState:
+        """Return read-only snapshot of full internal state.
+        Returns:
+            EnvironmentState with progress and history.
+        """
+        return EnvironmentState(
+            task_id=self.task_id,
+            current_step=self._current_step,
+            total_steps=self._max_steps,
+            done=self._done,
+            action_history=list(self._action_history),
+            reward_history=list(self._reward_history),
+        )
+    def _build_observation(self, email_index: int) -> EmailObservation:
+        """Build observation for the email at a given index.
+        Args:
+            email_index: Zero-based email index.
+        Returns:
+            EmailObservation for the selected email or terminal placeholder.
+        """
+        if not self._emails:
+            return self._terminal_observation()
+        safe_index = min(max(email_index, 0), len(self._emails) - 1)
+        email_payload = self._emails[safe_index]
+        return EmailObservation(
+            email_id=str(email_payload.get("email_id", "")),
+            subject=str(email_payload.get("subject", "")),
+            body=str(email_payload.get("body", "")),
+            sender=str(email_payload.get("sender", "")),
+            timestamp=str(email_payload.get("timestamp", "")),
+            thread_history=[str(item) for item in email_payload.get("thread_history", [])],
+            task_id=self.task_id,
+            step_number=self._current_step,
+            total_emails=len(self._emails),
+        )
+    def _terminal_observation(self) -> EmailObservation:
+        """Build terminal observation returned when episode is complete.
+        Returns:
+            Terminal EmailObservation payload.
+        """
+        return EmailObservation(
+            email_id="terminal",
+            subject="Episode complete",
+            body="No further emails remain for this task.",
+            sender="system",
+            timestamp="",
+            thread_history=[],
+            task_id=self.task_id,
+            step_number=self._current_step,
+            total_emails=len(self._emails),
+        )
+    def _grade_current_step(self, action: TriageAction) -> RewardResult:
+        """Select deterministic grader based on task and current progress.
+        Args:
+            action: Validated action for the current step.
+        Returns:
+            RewardResult from task-specific grader.
+        """
+        if not self._ground_truth:
+            return RewardResult(
+                score=SCORE_EPSILON,
+                breakdown={"missing_ground_truth": 1.0 - SCORE_EPSILON},
+                feedback="Missing ground truth for task.",
+            )
+        if self.task_id == "task_easy":
+            truth = self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
+            return grade_easy(action, truth)
+        if self.task_id == "task_medium":
+            truth = self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
+            return grade_medium_step(action, truth)
+        truth = self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
+        return grade_hard(action, truth)
+    def _compute_penalties(self, action: TriageAction) -> float:
+        """Compute deterministic penalties according to reward policy.
+        Args:
+            action: Validated action for the step.
+        Returns:
+            Total penalty value for current step.
+        """
+        penalty_total = 0.0
+        summary_too_short = len(action.summary.strip()) < 10
+        if action.label == "archive" and summary_too_short:
+            penalty_total += 0.5
+        if self._is_repeated_action_pattern(action):
+            penalty_total += 0.3
+        return penalty_total
+    def _compute_progress_signal(
+        self,
+        base_score: float,
+        previous_base_score: float | None,
+    ) -> float:
+        """Compute dense partial-progress reward independent of final completion.
+        Args:
+            base_score: Current-step base grade in [0.0, 1.0].
+            previous_base_score: Previous step base grade when available.
+        Returns:
+            Small positive/negative signal reflecting progress and quality trend.
+        """
+        total_emails = max(len(self._emails), 1)
+        progress_ratio = min(1.0, (self._current_index + 1) / total_emails)
+        completion_signal = 0.05 * progress_ratio
+        quality_signal = 0.05 * self._clip_reward(base_score)
+        trend_signal = 0.0
+        if previous_base_score is not None:
+            delta = base_score - previous_base_score
+            trend_signal = max(-0.02, min(0.03, delta * 0.1))
+        return completion_signal + quality_signal + trend_signal
+    def _compute_step_cost(self) -> float:
+        """Return a gentle efficiency cost that grows with episode length."""
+        normalized_step = self._current_step / max(self._max_steps, 1)
+        return 0.005 + (0.01 * normalized_step)
+    def _compute_trajectory_bonus(self) -> float:
+        """Return trajectory bonus when episode completion quality is high.
+        Returns:
+            0.2 when mean base score is above threshold at completion, else 0.0.
+        """
+        if not self._base_score_history:
+            return 0.0
+        all_emails_done_after_step = self._current_index + 1 >= len(self._emails)
+        if not all_emails_done_after_step:
+            return 0.0
+        mean_base = sum(self._base_score_history) / len(self._base_score_history)
+        return 0.2 if mean_base > 0.8 else 0.0
+    def _is_repeated_action_pattern(self, action: TriageAction) -> bool:
+        """Detect whether same action appears three times consecutively.
+        Args:
+            action: Current action.
+        Returns:
+            True when repeated label and route occur three times in a row.
+        """
+        if len(self._action_history) < 2:
+            return False
+        previous_action = self._action_history[-1]
+        older_action = self._action_history[-2]
+        return (
+            previous_action.label == older_action.label == action.label
+            and previous_action.route_to.strip().lower()
+            == older_action.route_to.strip().lower()
+            == action.route_to.strip().lower()
+        )
+    def _clip_reward(self, reward_value: float) -> float:
+        """Clip reward to the strict range (0.0, 1.0).
+        Args:
+            reward_value: Raw reward value.
+        Returns:
+            Clipped reward.
+        """
+        return max(SCORE_EPSILON, min(1.0 - SCORE_EPSILON, reward_value))

graders.py ADDED Viewed

	@@ -0,0 +1,319 @@

+"""Deterministic graders for OpenEnv email triage tasks."""
+import re
+from models import RewardResult, TriageAction
+# Canonical route categories mapped to the alias substrings that select them.
+# _canonical_route_tokens checks whether any alias occurs in the agent's
+# normalized route text.
+ROUTE_ALIAS_MAP = {
+    "billing": ["billing", "finance", "payments", "accounts"],
+    "safety": ["safety", "compliance", "risk"],
+    "engineering": ["engineering", "eng", "sre", "platform", "on-call"],
+    "support": ["support", "helpdesk", "customer support"],
+    "general": ["general", "inbox", "operations"],
+}
+# Substituted for exact 0.0 (and subtracted from exact 1.0) so reported scores
+# stay strictly inside the open interval (0.0, 1.0).
+SCORE_EPSILON = 1e-6
+def _strict_binary_score(is_positive_case: bool) -> float:
+    """Return strict in-range score for binary outcomes."""
+    return 1.0 - SCORE_EPSILON if is_positive_case else SCORE_EPSILON
+def _strict_ratio_score(raw_value: float) -> float:
+    """Return strict in-range score for ratio-like metrics."""
+    return _clip_score(raw_value)
+def _clip_score(score_value: float) -> float:
+    """Clip a score to the strict range (0.0, 1.0).
+    Args:
+        score_value: Raw score.
+    Returns:
+        Clipped score.
+    """
+    clipped = max(0.0, min(1.0, score_value))
+    if clipped <= 0.0:
+        return SCORE_EPSILON
+    if clipped >= 1.0:
+        return 1.0 - SCORE_EPSILON
+    return clipped
+def _normalized_text(text_value: str) -> str:
+    """Return normalized lowercase text for deterministic comparisons.
+    Args:
+        text_value: Input text.
+    Returns:
+        Normalized text.
+    """
+    return text_value.strip().lower()
+def _route_matches(action_route: str, expected_route: str) -> bool:
+    """Check if action route contains the expected route token.
+    Args:
+        action_route: Route provided by agent.
+        expected_route: Route expected by ground truth.
+    Returns:
+        True when expected route is present in the action route.
+    """
+    normalized_expected = _normalized_text(expected_route)
+    if not normalized_expected:
+        return False
+    return normalized_expected in _canonical_route_tokens(action_route)
+def _canonical_route_tokens(action_route: str) -> set[str]:
+    """Map free-form route text to canonical route categories."""
+    normalized_action = _normalized_text(action_route)
+    if not normalized_action:
+        return set()
+    route_fragments = [
+        fragment.strip()
+        for fragment in re.split(r"[,;/|]+", normalized_action)
+        if fragment.strip()
+    ]
+    canonical: set[str] = set()
+    for fragment in route_fragments:
+        for route_name, aliases in ROUTE_ALIAS_MAP.items():
+            if any(alias in fragment for alias in aliases):
+                canonical.add(route_name)
+                break
+    # Fallback for phrases without separators.
+    if not canonical:
+        for route_name, aliases in ROUTE_ALIAS_MAP.items():
+            if any(alias in normalized_action for alias in aliases):
+                canonical.add(route_name)
+    return canonical
+def _route_noise_penalty(action_route: str) -> float:
+    """Penalize over-routing to many teams in one action."""
+    route_count = len(_canonical_route_tokens(action_route))
+    if route_count <= 2:
+        return 0.0
+    return min(0.24, 0.08 * (route_count - 2))
+def _summary_keyword_score(summary_text: str, ground_truth: dict) -> float:
+    """Score summary quality using deterministic keyword overlap.
+    Args:
+        summary_text: Summary text produced by the agent.
+        ground_truth: Ground-truth dict that may include summary keywords.
+    Returns:
+        Score in [0.0, 1.0] based on matched summary keywords.
+    """
+    raw_keywords = ground_truth.get("summary_keywords", [])
+    if not isinstance(raw_keywords, list):
+        return _strict_binary_score(len(summary_text.strip()) >= 10)
+    keywords = [
+        _normalized_text(str(keyword))
+        for keyword in raw_keywords
+        if _normalized_text(str(keyword))
+    ]
+    if not keywords:
+        return _strict_binary_score(len(summary_text.strip()) >= 10)
+    normalized_summary = _normalized_text(summary_text)
+    matches = 0
+    for keyword in keywords:
+        if keyword in normalized_summary:
+            matches += 1
+    base_score = matches / len(keywords)
+    # Discourage keyword stuffing and overly verbose summaries.
+    word_count = len(re.findall(r"[a-z0-9'-]+", normalized_summary))
+    if word_count < 4:
+        brevity_factor = 0.6
+    elif word_count <= 40:
+        brevity_factor = 1.0
+    else:
+        brevity_factor = max(0.45, 1.0 - (word_count - 40) * 0.02)
+    list_like_penalty = 0.85 if normalized_summary.count(",") >= 6 and matches >= 3 else 1.0
+    return _clip_score(base_score * brevity_factor * list_like_penalty)
+def grade_easy(action: TriageAction, ground_truth: dict) -> RewardResult:
+    """Grade easy task with deterministic partial credit.
+    Args:
+        action: Agent action for one email.
+        ground_truth: Expected label and route.
+    Returns:
+        Deterministic reward result in [0.0, 1.0].
+    """
+    expected_label = _normalized_text(str(ground_truth.get("label", "")))
+    expected_route = _normalized_text(str(ground_truth.get("route_to", "")))
+    label_correct = _normalized_text(action.label) == expected_label
+    route_correct = _route_matches(action.route_to, expected_route)
+    summary_score = _summary_keyword_score(action.summary, ground_truth)
+    noise_penalty = _route_noise_penalty(action.route_to)
+    score_value = (0.6 if label_correct else 0.0) + (0.25 if route_correct else 0.0)
+    score_value += 0.15 * summary_score
+    score_value -= noise_penalty
+    score_value = _clip_score(score_value)
+    breakdown = {
+        "label_match": _strict_binary_score(label_correct),
+        "route_match": _strict_binary_score(route_correct),
+        "summary_match": _strict_ratio_score(summary_score),
+        "route_noise_penalty": _strict_ratio_score(noise_penalty),
+    }
+    feedback = "Easy-task grading completed with context summary scoring."
+    return RewardResult(score=score_value, breakdown=breakdown, feedback=feedback)
+def grade_medium_step(action: TriageAction, truth: dict) -> RewardResult:
+    """Grade one medium-task step without cumulative history effects."""
+    expected_label = _normalized_text(str(truth.get("label", "")))
+    expected_route = _normalized_text(str(truth.get("route_to", "")))
+    priority_weight = max(float(truth.get("priority_weight", 1.0)), 0.1)
+    label_correct = _normalized_text(action.label) == expected_label
+    route_correct = _route_matches(action.route_to, expected_route)
+    summary_score = _summary_keyword_score(action.summary, truth)
+    noise_penalty = _route_noise_penalty(action.route_to)
+    per_email_score = (0.55 if label_correct else 0.0) + (0.3 if route_correct else 0.0)
+    per_email_score += 0.15 * summary_score
+    per_email_score -= noise_penalty
+    per_email_score = _clip_score(per_email_score)
+    weighted_step_score = _clip_score(per_email_score * min(priority_weight, 2.0))
+    return RewardResult(
+        score=weighted_step_score,
+        breakdown={
+            "label_match": _strict_binary_score(label_correct),
+            "route_match": _strict_binary_score(route_correct),
+            "summary_match": _strict_ratio_score(summary_score),
+            "priority_weight": _strict_ratio_score(min(priority_weight / 2.0, 1.0)),
+            "route_noise_penalty": _strict_ratio_score(noise_penalty),
+        },
+        feedback="Medium-task step grading completed.",
+    )
+def grade_medium(actions: list[TriageAction], ground_truths: list[dict]) -> RewardResult:
+    """Grade medium task using weighted per-email partial scoring.
+    Args:
+        actions: Agent actions for the medium task email queue.
+        ground_truths: Expected action details for each email.
+    Returns:
+        Deterministic reward result in [0.0, 1.0].
+    """
+    comparable_count = min(len(actions), len(ground_truths))
+    if comparable_count == 0:
+        return RewardResult(
+            score=SCORE_EPSILON,
+            breakdown={"emails_scored": SCORE_EPSILON, "weighted_average": SCORE_EPSILON},
+            feedback="No actions available for grading.",
+        )
+    weighted_score_sum = 0.0
+    weight_sum = 0.0
+    label_hits = 0
+    route_hits = 0
+    summary_total = 0.0
+    noise_penalty_total = 0.0
+    for index in range(comparable_count):
+        action = actions[index]
+        truth = ground_truths[index]
+        step_result = grade_medium_step(action, truth)
+        priority_weight = max(float(truth.get("priority_weight", 1.0)), 0.1)
+        weighted_score_sum += step_result.score
+        weight_sum += min(priority_weight, 2.0)
+        expected_label = _normalized_text(str(truth.get("label", "")))
+        expected_route = _normalized_text(str(truth.get("route_to", "")))
+        label_hits += 1 if _normalized_text(action.label) == expected_label else 0
+        route_hits += 1 if _route_matches(action.route_to, expected_route) else 0
+        summary_total += float(step_result.breakdown.get("summary_match", SCORE_EPSILON))
+        noise_penalty_total += float(
+            step_result.breakdown.get("route_noise_penalty", SCORE_EPSILON)
+        )
+    weighted_average = weighted_score_sum / weight_sum if weight_sum > 0.0 else 0.0
+    score_value = _clip_score(weighted_average)
+    breakdown = {
+        "emails_scored": _strict_ratio_score(float(comparable_count) / (comparable_count + 1.0)),
+        "label_accuracy": _strict_ratio_score(label_hits / comparable_count),
+        "route_accuracy": _strict_ratio_score(route_hits / comparable_count),
+        "summary_accuracy": _strict_ratio_score(summary_total / comparable_count),
+        "avg_route_noise_penalty": _strict_ratio_score(noise_penalty_total / comparable_count),
+        "weighted_average": score_value,
+    }
+    feedback = "Weighted medium-task grading completed."
+    return RewardResult(score=score_value, breakdown=breakdown, feedback=feedback)
+def grade_hard(action: TriageAction, ground_truth: dict) -> RewardResult:
+    """Grade hard task using weighted policy-sensitive components.
+    Args:
+        action: Agent action for hard task case.
+        ground_truth: Expected routing and urgency intent.
+    Returns:
+        Deterministic reward result in [0.0, 1.0].
+    """
+    expected_label = _normalized_text(str(ground_truth.get("label", "urgent")))
+    primary_route = _normalized_text(str(ground_truth.get("route_to", "safety")))
+    secondary_route = _normalized_text(str(ground_truth.get("cc_route", "billing")))
+    spam_penalty = float(ground_truth.get("penalize_spam", 0.2))
+    normalized_route = _normalized_text(action.route_to)
+    has_primary_route = _route_matches(normalized_route, primary_route)
+    has_secondary_route = _route_matches(normalized_route, secondary_route)
+    urgent_label = _normalized_text(action.label) == expected_label
+    summary_score = _summary_keyword_score(action.summary, ground_truth)
+    noise_penalty = _route_noise_penalty(action.route_to)
+    escalation_component = 0.35 if has_primary_route else 0.0
+    routing_component = 0.25 if has_secondary_route else 0.0
+    urgency_component = 0.25 if urgent_label else 0.0
+    summary_component = 0.15 * summary_score
+    raw_score = escalation_component + routing_component + urgency_component + summary_component
+    raw_score -= noise_penalty
+    if _normalized_text(action.label) == "spam":
+        raw_score -= spam_penalty
+    score_value = _clip_score(raw_score)
+    breakdown = {
+        "escalation_component": _strict_ratio_score(escalation_component),
+        "routing_component": _strict_ratio_score(routing_component),
+        "urgency_component": _strict_ratio_score(urgency_component),
+        "summary_component": _strict_ratio_score(summary_component),
+        "route_noise_penalty": _strict_ratio_score(noise_penalty),
+        "spam_penalty": _strict_ratio_score(
+            spam_penalty if _normalized_text(action.label) == "spam" else SCORE_EPSILON
+        ),
+    }
+    feedback = "Hard-task weighted policy grading completed."
+    return RewardResult(score=score_value, breakdown=breakdown, feedback=feedback)

inference.py ADDED Viewed

	@@ -0,0 +1,384 @@

+"""Inference script for OpenEnv email triage with strict stdout event format."""
+import argparse
+import json
+import os
+import re
+import time
+from typing import Any
+from openai import OpenAI
+from environment import EmailTriageEnv
+from models import EmailObservation, TriageAction
+# Endpoint and model settings, read from the environment with defaults.
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+HF_TOKEN = os.getenv("HF_TOKEN")
+# HF_TOKEN takes precedence; API_KEY is only consulted when HF_TOKEN is unset or empty.
+API_KEY = HF_TOKEN or os.getenv("API_KEY")
+LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
+BENCHMARK = "openenv-email-triage"
+# NOTE(review): the next four values are not used in this part of the file;
+# presumably they are the episode step cap, the LLM sampling settings, and the
+# pass/fail cut-off. Confirm against their usage.
+MAX_STEPS = 30
+TEMPERATURE = 0.2
+MAX_TOKENS = 200
+SUCCESS_SCORE_THRESHOLD = 0.5
+# Clamp margin applied by _format_open_score before a score is printed.
+LOG_SCORE_EPSILON = 1e-6
+# Defaults for the --runtime-budget-seconds and --request-timeout-seconds flags.
+DEFAULT_RUNTIME_BUDGET_SECONDS = int(os.getenv("INFERENCE_RUNTIME_BUDGET_SECONDS", "1140"))
+DEFAULT_REQUEST_TIMEOUT_SECONDS = float(os.getenv("INFERENCE_REQUEST_TIMEOUT_SECONDS", "12"))
+SYSTEM_PROMPT = (
+    "You are an email triage assistant. For each email, prioritize risk/time impact, "
+    "categorize with one label (urgent|normal|spam|archive), route to the best team, "
+    "and summarize the key evidence. Return one JSON object with keys label, summary, route_to."
+)
+# NOTE(review): presumably the action used when a model response cannot be
+# parsed; the consumer is outside this part of the file.
+FALLBACK_ACTION = {
+    "label": "normal",
+    "summary": "Unable to parse response",
+    "route_to": "general",
+}
+# Maps the numeric --task choices to environment task ids.
+TASK_MAP = {
+    "1": "task_easy",
+    "2": "task_medium",
+    "3": "task_hard",
+    "4": "task_production",
+}
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments for task and optional model override."""
+    parser = argparse.ArgumentParser(description="Run OpenEnv email triage inference.")
+    parser.add_argument(
+        "--task",
+        default="all",
+        choices=["1", "2", "3", "4", "all"],
+        help="Task selection: 1, 2, 3, 4, or all.",
+    )
+    parser.add_argument(
+        "--model",
+        default=None,
+        help="Optional model override. Falls back to MODEL_NAME environment variable.",
+    )
+    parser.add_argument(
+        "--split",
+        default=os.getenv("OPENENV_EVAL_SPLIT", "public"),
+        choices=["public", "private_eval"],
+        help="Scenario split to evaluate.",
+    )
+    parser.add_argument(
+        "--episodes-per-task",
+        default=1,
+        type=int,
+        help="Number of deterministic scenarios to evaluate per task.",
+    )
+    parser.add_argument(
+        "--runtime-budget-seconds",
+        default=DEFAULT_RUNTIME_BUDGET_SECONDS,
+        type=int,
+        help="Global wall-clock budget for the full script run.",
+    )
+    parser.add_argument(
+        "--request-timeout-seconds",
+        default=DEFAULT_REQUEST_TIMEOUT_SECONDS,
+        type=float,
+        help="Timeout per LLM request.",
+    )
+    parser.add_argument(
+        "--production-profile",
+        default="standard",
+        choices=["light", "standard", "heavy"],
+        help="Runtime workload profile used for task 4 episodes.",
+    )
+    parser.add_argument(
+        "--business-hours-mode",
+        action="store_true",
+        help="If set, task 4 timestamps focus on business-hours windows.",
+    )
+    parser.add_argument(
+        "--escalation-mode",
+        default="normal",
+        choices=["low", "normal", "high"],
+        help="Escalation strictness for task 4 follow-up generation.",
+    )
+    return parser.parse_args()
+def validate_runtime_config(model_name: str | None) -> str:
+    """Validate required runtime settings and return effective model name."""
+    if not API_KEY:
+        raise ValueError("Missing HF_TOKEN or API_KEY environment variable.")
+    effective_model = model_name or MODEL_NAME
+    return effective_model
+def log_start(task_name: str, benchmark_name: str, model_name: str) -> None:
+    """Emit mandatory START line."""
+    print(
+        f"[START] task={task_name} env={benchmark_name} model={model_name}",
+        flush=True,
+    )
+def _format_open_score(value: float) -> str:
+    """Format scores without collapsing strict-open values to 0.00 or 1.00."""
+    clamped = max(LOG_SCORE_EPSILON, min(1.0 - LOG_SCORE_EPSILON, float(value)))
+    return f"{clamped:.6f}"
+def log_step(step: int, action_str: str, reward: float, done: bool, error: str | None) -> None:
+    """Print the [STEP] event line; a missing or empty error is written as "null"."""
+    step_fields = [
+        f"step={step}",
+        f"action={action_str}",
+        f"reward={_format_open_score(reward)}",
+        f"done={str(done).lower()}",
+        f"error={error if error else 'null'}",
+    ]
+    print("[STEP] " + " ".join(step_fields), flush=True)
+def log_end(success: bool, steps: int, rewards: list[float]) -> None:
+    """Emit mandatory END line."""
+    rewards_str = ",".join(_format_open_score(reward) for reward in rewards)
+    print(
+        f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}",
+        flush=True,
+    )
+def build_user_prompt(observation: EmailObservation, history: list[str]) -> str:
+    """Compose the user message for the model from one observation.
+
+    Args:
+        observation: Current email observation; its fields are rendered one
+            per line in a fixed order.
+        history: Step summaries recorded so far. Only the last five entries
+            are included, or the literal text ``None`` when the list is empty.
+
+    Returns:
+        The prompt text, ending with the instruction to reply with a single
+        JSON object.
+    """
+    if history:
+        recent_history = "\n".join(history[-5:])
+    else:
+        recent_history = "None"
+    email_section = (
+        f"email_id: {observation.email_id}\n"
+        f"subject: {observation.subject}\n"
+        f"sender: {observation.sender}\n"
+        f"timestamp: {observation.timestamp}\n"
+        f"body: {observation.body}\n"
+        f"thread_history: {observation.thread_history}\n"
+        f"task_id: {observation.task_id}\n"
+        f"step_number: {observation.step_number}\n"
+        f"total_emails: {observation.total_emails}\n\n"
+    )
+    history_section = f"recent_history:\n{recent_history}\n\n"
+    instruction = "Return exactly one JSON object with label, summary, route_to."
+    return email_section + history_section + instruction
+def strip_action_prefixes(response_text: str) -> str:
+    """Strip code fences and a leading action label from raw model output.
+
+    Args:
+        response_text: Raw text returned by the model.
+
+    Returns:
+        The text with surrounding whitespace, an opening and closing
+        triple-backtick fence, and a leading ``action:`` / ``next action:``
+        label removed.
+    """
+    # (pattern, flags) pairs applied in order: opening fence, closing fence,
+    # then the optional action label.
+    cleanup_steps = (
+        (r"^```(?:json)?", re.IGNORECASE),
+        (r"```$", 0),
+        (r"^(next\s+action|action)\s*:\s*", re.IGNORECASE),
+    )
+    text = response_text.strip()
+    for pattern, flags in cleanup_steps:
+        text = re.sub(pattern, "", text, flags=flags).strip()
+    return text
+def parse_text_action(cleaned_text: str) -> dict[str, str]:
+    """Parse action from free-form text with deterministic regex fallback.
+
+    Looks for ``label``, ``route_to`` (or ``route``) and ``summary`` written as
+    ``key: value`` or ``key=value``, with or without double quotes.
+
+    Args:
+        cleaned_text: Model output after wrapper removal.
+
+    Returns:
+        A dict holding only the fields that were found. ``label`` and
+        ``route_to`` are lowercased; ``route_to`` and ``summary`` are stripped.
+    """
+    result: dict[str, str] = {}
+    label_match = re.search(
+        r"(?:\"label\"|label)\s*[:=]\s*\"?(urgent|normal|spam|archive)\"?",
+        cleaned_text,
+        flags=re.IGNORECASE,
+    )
+    if label_match:
+        result["label"] = label_match.group(1).lower()
+    # Route names may contain spaces ("customer support"), so the value is
+    # matched character by character and stops before whitespace that
+    # introduces the next field key. Without that guard,
+    # "route_to=safety summary=..." would yield the route "safety summary".
+    route_match = re.search(
+        r"(?:\"route_to\"|route_to|route)\s*[:=]\s*\"?"
+        r"((?:(?!\s+(?:label|summary|route_to|route)\s*[:=])[a-zA-Z0-9_\-/ ])+)\"?",
+        cleaned_text,
+        flags=re.IGNORECASE,
+    )
+    if route_match:
+        result["route_to"] = route_match.group(1).strip().lower()
+    summary_match = re.search(
+        r"(?:\"summary\"|summary)\s*[:=]\s*\"?([^\"\n]+)\"?",
+        cleaned_text,
+        flags=re.IGNORECASE,
+    )
+    if summary_match:
+        result["summary"] = summary_match.group(1).strip()
+    return result
+def parse_action_response(response_text: str) -> TriageAction:
+    """Parse model response into a valid TriageAction with fallback behavior.
+
+    The outermost ``{...}`` span is tried as JSON first. If that yields no
+    usable object, ``parse_text_action`` extracts fields with regexes. Parsed
+    fields are layered over ``FALLBACK_ACTION``; if the merged result fails
+    validation, the pure fallback action is returned.
+
+    Args:
+        response_text: Raw text returned by the model (may be empty).
+
+    Returns:
+        A validated ``TriageAction``.
+    """
+    cleaned_text = strip_action_prefixes(response_text)
+    parsed_payload: dict[str, Any] = {}
+    json_start = cleaned_text.find("{")
+    json_end = cleaned_text.rfind("}")
+    if json_start != -1 and json_end > json_start:
+        try:
+            loaded = json.loads(cleaned_text[json_start : json_end + 1])
+        except json.JSONDecodeError:
+            loaded = None
+        if isinstance(loaded, dict):
+            parsed_payload = loaded
+    if not parsed_payload:
+        parsed_payload = parse_text_action(cleaned_text)
+    merged = dict(FALLBACK_ACTION)
+    for key, value in parsed_payload.items():
+        if value is None:
+            # An explicit JSON null keeps the fallback value, so one missing
+            # field does not invalidate the fields that were parsed correctly.
+            continue
+        if key in ("label", "route_to") and isinstance(value, str):
+            # Match the normalisation applied by the regex fallback path.
+            value = value.strip().lower()
+        merged[key] = value
+    try:
+        return TriageAction.model_validate(merged)
+    except Exception:
+        # Deliberately broad: any validation failure degrades to the fallback
+        # action so the episode can continue.
+        return TriageAction.model_validate(FALLBACK_ACTION)
+def action_to_log_string(action: TriageAction) -> str:
+    """Serialise an action as compact, ASCII-only JSON on a single line."""
+    payload = action.model_dump()
+    # No spaces after separators, so the value stays one whitespace-free token
+    # unless a field itself contains spaces.
+    compact_separators = (",", ":")
+    return json.dumps(payload, separators=compact_separators, ensure_ascii=True)
+def run_episode(
+    client: OpenAI,
+    model_name: str,
+    task_id: str,
+    scenario_index: int,
+    eval_split: str,
+    deadline: float,
+    request_timeout_seconds: float,
+    runtime_options: dict[str, Any] | None = None,
+) -> None:
+    """Run one episode and emit strict START/STEP/END lines.
+
+    Args:
+        client: Chat-completions client used to query the model.
+        model_name: Model identifier sent with every request and in the START line.
+        task_id: Task passed to ``EmailTriageEnv`` and printed in the START line.
+        scenario_index: Scenario index passed to ``EmailTriageEnv``.
+        eval_split: Split name passed to ``EmailTriageEnv`` as ``split``.
+        deadline: ``time.monotonic()`` value after which no new step is started.
+        request_timeout_seconds: Upper bound for a single model request.
+        runtime_options: Optional extra options forwarded to ``EmailTriageEnv``.
+
+    The function never raises for ordinary errors: any ``Exception`` marks the
+    episode unsuccessful, and the END line is always printed.
+    """
+    rewards: list[float] = []
+    steps_taken = 0
+    success = False
+    env: EmailTriageEnv | None = None
+    # START is logged before the try block, so it is emitted even when the
+    # environment cannot be created; the finally clause supplies the END line.
+    log_start(task_name=task_id, benchmark_name=BENCHMARK, model_name=model_name)
+    try:
+        env = EmailTriageEnv(
+            task_id=task_id,
+            scenario_index=scenario_index,
+            split=eval_split,
+            runtime_options=runtime_options,
+        )
+        reset_result = env.reset()
+        observation = reset_result.observation
+        history: list[str] = []
+        for step in range(1, MAX_STEPS + 1):
+            # Stop starting new steps once the deadline has passed.
+            if time.monotonic() >= deadline:
+                break
+            prompt = build_user_prompt(observation, history)
+            response_text = ""
+            try:
+                # Per-request timeout: the configured limit capped by the
+                # time left before the deadline, but never below one second.
+                remaining = max(1.0, deadline - time.monotonic())
+                timeout_seconds = max(
+                    1.0,
+                    min(float(request_timeout_seconds), float(remaining)),
+                )
+                completion = client.chat.completions.create(
+                    model=model_name,
+                    messages=[
+                        {"role": "system", "content": SYSTEM_PROMPT},
+                        {"role": "user", "content": prompt},
+                    ],
+                    temperature=TEMPERATURE,
+                    max_tokens=MAX_TOKENS,
+                    stream=False,
+                    timeout=timeout_seconds,
+                )
+                response_text = completion.choices[0].message.content or ""
+            except Exception:
+                # A failed request is treated as an empty reply; the parser
+                # below then produces the fallback action and the step proceeds.
+                response_text = ""
+            action = parse_action_response(response_text)
+            step_result = env.step(action)
+            reward = float(step_result.reward)
+            done = bool(step_result.done)
+            # Only a string validation error is reported; anything else logs as null.
+            error_raw = step_result.info.get("validation_error")
+            error = str(error_raw) if isinstance(error_raw, str) else None
+            rewards.append(reward)
+            steps_taken = step
+            log_step(
+                step=step,
+                action_str=action_to_log_string(action),
+                reward=reward,
+                done=done,
+                error=error,
+            )
+            # Short per-step record; build_user_prompt shows the last five to the model.
+            history.append(
+                f"step={step} action={action.label}/{action.route_to} reward={_format_open_score(reward)}"
+            )
+            observation = step_result.observation
+            if done:
+                break
+        # Success means the mean step reward reaches the threshold; with no
+        # steps taken the mean is 0 (the divisor is clamped to 1).
+        avg_reward = sum(rewards) / max(len(rewards), 1)
+        success = avg_reward >= SUCCESS_SCORE_THRESHOLD
+    except Exception:
+        # Any failure in setup or stepping ends the episode as unsuccessful;
+        # rewards and steps collected so far are still reported by log_end.
+        success = False
+    finally:
+        if env is not None:
+            # close() is optional on the environment; errors while closing are ignored.
+            close_method = getattr(env, "close", None)
+            if callable(close_method):
+                try:
+                    close_method()
+                except Exception:
+                    pass
+        log_end(success=success, steps=steps_taken, rewards=rewards)
+def main() -> None:
+    """Run the selected task(s) for the requested number of episodes.
+
+    Parses CLI arguments, validates the runtime configuration (exiting with
+    status 1 and printing the message on failure), builds the API client and
+    then calls ``run_episode`` for every task/scenario pair. All episodes
+    share one deadline derived from the runtime budget.
+    """
+    args = parse_args()
+    # Budget and request timeout are both floored at one second.
+    deadline = time.monotonic() + max(args.runtime_budget_seconds, 1)
+    request_timeout_seconds = max(float(args.request_timeout_seconds), 1.0)
+    try:
+        effective_model = validate_runtime_config(args.model)
+    except ValueError as config_error:
+        print(str(config_error), flush=True)
+        raise SystemExit(1) from config_error
+    _ = LOCAL_IMAGE_NAME
+    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+    # A recognised task key selects that single task; anything else runs them all.
+    if args.task in TASK_MAP:
+        task_ids = [TASK_MAP[args.task]]
+    else:
+        task_ids = list(TASK_MAP.values())
+    for task_id in task_ids:
+        # Only the production task receives the extra runtime options.
+        if task_id == "task_production":
+            runtime_options = {
+                "production_profile": args.production_profile,
+                "business_hours_mode": args.business_hours_mode,
+                "escalation_mode": args.escalation_mode,
+            }
+        else:
+            runtime_options = None
+        episode_count = max(args.episodes_per_task, 1)
+        for scenario_index in range(episode_count):
+            run_episode(
+                client=client,
+                model_name=effective_model,
+                task_id=task_id,
+                scenario_index=scenario_index,
+                eval_split=args.split,
+                deadline=deadline,
+                request_timeout_seconds=request_timeout_seconds,
+                runtime_options=runtime_options,
+            )
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

server.py ADDED Viewed

	@@ -0,0 +1,775 @@

+"""Flask server wrapper for the OpenEnv email triage environment."""
+import os
+from flask import Flask, Response, jsonify, request
+from environment import EmailTriageEnv
+from tasks import get_task_scenario_count, list_task_ids
+FRONTEND_HTML = """<!doctype html>
+<html lang="en">
+<head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>Inbox Helper Practice</title>
+    <style>
+        @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
+        :root {
+            --bg: #f5f1e9;
+            --paper: #fffaf2;
+            --ink: #102433;
+            --accent: #ea6a2a;
+            --accent-soft: #ffd6bf;
+            --line: #d7cabb;
+            --ok: #0f7b6c;
+            --warn: #9a3a12;
+            --radius: 14px;
+        }
+        * { box-sizing: border-box; }
+        body {
+            margin: 0;
+            font-family: 'Space Grotesk', sans-serif;
+            color: var(--ink);
+            background:
+                radial-gradient(1100px 460px at -10% -20%, #f2bc9f 0%, transparent 60%),
+                radial-gradient(1100px 520px at 120% 115%, #b8d7cf 0%, transparent 62%),
+                var(--bg);
+            min-height: 100vh;
+        }
+        .wrap {
+            max-width: 1100px;
+            margin: 28px auto;
+            padding: 0 16px;
+            animation: reveal .45s ease-out;
+        }
+        @keyframes reveal {
+            from { opacity: 0; transform: translateY(10px); }
+            to { opacity: 1; transform: translateY(0); }
+        }
+        .title {
+            display: flex;
+            justify-content: space-between;
+            align-items: baseline;
+            gap: 14px;
+            margin-bottom: 14px;
+        }
+        h1 {
+            margin: 0;
+            font-size: clamp(1.5rem, 2vw, 2.2rem);
+            letter-spacing: .4px;
+        }
+        .subtitle {
+            margin: 6px 0 0;
+            font-size: .95rem;
+            opacity: .8;
+        }
+        .badge {
+            background: var(--accent-soft);
+            border: 1px solid #f2b693;
+            color: #7f2e0b;
+            padding: 6px 10px;
+            border-radius: 999px;
+            font-size: .85rem;
+            font-weight: 600;
+        }
+        .grid {
+            display: grid;
+            grid-template-columns: 1fr;
+            gap: 14px;
+        }
+        @media (min-width: 900px) {
+            .grid { grid-template-columns: 1fr 1fr; }
+            .wide { grid-column: span 2; }
+        }
+        .card {
+            background: var(--paper);
+            border: 1px solid var(--line);
+            border-radius: var(--radius);
+            padding: 14px;
+            box-shadow: 0 8px 28px rgba(16, 36, 51, 0.08);
+        }
+        .card h2 {
+            margin: 0 0 10px;
+            font-size: 1rem;
+            text-transform: uppercase;
+            letter-spacing: .08em;
+            opacity: .86;
+        }
+        .row {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 8px;
+            align-items: center;
+            margin-bottom: 10px;
+        }
+        select, input, textarea, button {
+            font-family: inherit;
+            font-size: .95rem;
+        }
+        select, input, textarea {
+            width: 100%;
+            border: 1px solid #cdbba6;
+            border-radius: 10px;
+            padding: 9px 10px;
+            background: #fff;
+            color: var(--ink);
+        }
+        textarea {
+            min-height: 92px;
+            resize: vertical;
+        }
+        button {
+            border: 0;
+            border-radius: 10px;
+            padding: 9px 12px;
+            font-weight: 700;
+            background: var(--ink);
+            color: #fff;
+            cursor: pointer;
+            transition: transform .12s ease, opacity .12s ease;
+        }
+        button.secondary {
+            background: #285066;
+        }
+        button.accent {
+            background: var(--accent);
+        }
+        button:hover { transform: translateY(-1px); }
+        button:active { transform: translateY(0); opacity: .92; }
+        .status {
+            padding: 8px 10px;
+            border-radius: 10px;
+            background: #eef7f5;
+            border: 1px solid #c7e4de;
+            color: var(--ok);
+            font-weight: 600;
+            min-height: 40px;
+            display: flex;
+            align-items: center;
+        }
+        .status.error {
+            background: #fff1ea;
+            border-color: #ffc8ae;
+            color: var(--warn);
+        }
+        pre {
+            margin: 0;
+            white-space: pre-wrap;
+            background: #0f1b24;
+            color: #d9efe9;
+            border-radius: 10px;
+            padding: 12px;
+            max-height: 340px;
+            overflow: auto;
+            font-family: 'IBM Plex Mono', monospace;
+            font-size: .85rem;
+            border: 1px solid #21313f;
+        }
+        .email-block {
+            background: #fff;
+            border: 1px solid #d9ccbc;
+            border-radius: 10px;
+            padding: 12px;
+        }
+        .email-row {
+            margin-bottom: 8px;
+            font-size: .95rem;
+            line-height: 1.35;
+        }
+        .email-row strong {
+            display: inline-block;
+            min-width: 66px;
+        }
+        .help {
+            margin: 0 0 10px;
+            font-size: .9rem;
+            opacity: .8;
+        }
+        .metric {
+            display: flex;
+            justify-content: space-between;
+            align-items: center;
+            margin-bottom: 8px;
+            padding-bottom: 6px;
+            border-bottom: 1px dashed #dbcfbe;
+            font-size: .95rem;
+        }
+        .metric strong {
+            font-weight: 700;
+        }
+        .coach {
+            background: #fff7ed;
+            border: 1px solid #f2caa9;
+            border-radius: 10px;
+            padding: 10px;
+            min-height: 74px;
+            line-height: 1.4;
+            font-size: .92rem;
+        }
+        .chip-row {
+            display: flex;
+            flex-wrap: wrap;
+            gap: 8px;
+            margin-top: 10px;
+        }
+        .chip {
+            background: #eaf3ff;
+            border: 1px solid #b9d1ef;
+            color: #184469;
+            border-radius: 999px;
+            padding: 6px 10px;
+            font-size: .84rem;
+            cursor: pointer;
+            font-weight: 600;
+        }
+    </style>
+</head>
+<body>
+    <div class="wrap">
+        <div class="title">
+            <div>
+                <h1>Inbox Helper Practice</h1>
+                <p class="subtitle">Practice deciding priority, category, and who should handle each email.</p>
+            </div>
+            <span class="badge" id="badge">connecting...</span>
+        </div>
+        <div class="grid">
+            <section class="card">
+                <h2>Start a Scenario</h2>
+                <p class="help">Pick a difficulty, then click Start.</p>
+                <div class="row">
+                    <select id="taskId">
+                        <option value="task_easy">Easy: one clear email</option>
+                        <option value="task_medium">Medium: mixed inbox</option>
+                        <option value="task_hard">Hard: high-risk complaint</option>
+                        <option value="task_production">Production: full inbox simulator</option>
+                    </select>
+                </div>
+                <div id="productionControls" style="display:none;">
+                    <div class="row">
+                        <select id="productionProfile">
+                            <option value="light">Workload: Light</option>
+                            <option value="standard" selected>Workload: Standard</option>
+                            <option value="heavy">Workload: Heavy</option>
+                        </select>
+                    </div>
+                    <div class="row">
+                        <select id="businessHoursMode">
+                            <option value="false" selected>Time Profile: 24x7 inbox</option>
+                            <option value="true">Time Profile: business hours focus</option>
+                        </select>
+                    </div>
+                    <div class="row">
+                        <select id="escalationMode">
+                            <option value="low">Escalation: Low</option>
+                            <option value="normal" selected>Escalation: Normal</option>
+                            <option value="high">Escalation: High</option>
+                        </select>
+                    </div>
+                </div>
+                <div class="row">
+                    <button class="accent" id="btnReset">Start</button>
+                    <button class="secondary" id="btnState">Check Progress</button>
+                </div>
+                <div class="status" id="status">Ready. Start a scenario.</div>
+            </section>
+            <section class="card">
+                <h2>Your Decision</h2>
+                <p class="help">Choose priority, who should handle it, and a short reason.</p>
+                <div class="row">
+                    <select id="label">
+                        <option value="urgent">Urgent</option>
+                        <option value="normal" selected>Normal</option>
+                        <option value="spam">Spam</option>
+                        <option value="archive">Archive</option>
+                    </select>
+                </div>
+                <div class="row">
+                    <input id="routeTo" placeholder="Who should handle this? (billing, safety, engineering, support)" value="general" />
+                </div>
+                <div class="row">
+                    <textarea id="summary" placeholder="Write one clear sentence with key clues from the email.">Needs review.</textarea>
+                </div>
+                <div class="row">
+                    <button id="btnStep">Send Decision</button>
+                </div>
+            </section>
+            <section class="card wide">
+                <h2>Current Email</h2>
+                <div class="email-block">
+                    <div class="email-row"><strong>Subject:</strong> <span id="mailSubject">No email loaded yet.</span></div>
+                    <div class="email-row"><strong>From:</strong> <span id="mailSender">-</span></div>
+                    <div class="email-row"><strong>Message:</strong> <span id="mailBody">Start a scenario to load an email.</span></div>
+                </div>
+            </section>
+            <section class="card">
+                <h2>Live Progress</h2>
+                <div class="metric"><span>Task</span><strong id="insightTask">-</strong></div>
+                <div class="metric"><span>Scenario</span><strong id="insightScenario">-</strong></div>
+                <div class="metric"><span>Progress</span><strong id="insightProgress">0/0</strong></div>
+                <div class="metric"><span>Last Reward</span><strong id="insightReward">-</strong></div>
+                <div class="metric"><span>Base Score</span><strong id="insightBase">-</strong></div>
+            </section>
+            <section class="card">
+                <h2>Coach Notes</h2>
+                <p class="help">Use this to improve your next triage action.</p>
+                <div class="coach" id="coachNotes">Start a scenario and submit one decision to get feedback.</div>
+                <div class="chip-row">
+                    <button class="chip" id="chipSafety">Quick Fill: Urgent + Safety</button>
+                    <button class="chip" id="chipBilling">Quick Fill: Normal + Billing</button>
+                    <button class="chip" id="chipSpam">Quick Fill: Spam + General</button>
+                </div>
+            </section>
+            <section class="card wide">
+                <h2>Details (Advanced)</h2>
+                <pre id="output">Waiting for your first action...</pre>
+            </section>
+        </div>
+    </div>
+    <script>
+        const statusEl = document.getElementById('status');
+        const badgeEl = document.getElementById('badge');
+        const outEl = document.getElementById('output');
+        const mailSubjectEl = document.getElementById('mailSubject');
+        const mailSenderEl = document.getElementById('mailSender');
+        const mailBodyEl = document.getElementById('mailBody');
+        const taskIdEl = document.getElementById('taskId');
+        const productionControlsEl = document.getElementById('productionControls');
+        const insightTaskEl = document.getElementById('insightTask');
+        const insightScenarioEl = document.getElementById('insightScenario');
+        const insightProgressEl = document.getElementById('insightProgress');
+        const insightRewardEl = document.getElementById('insightReward');
+        const insightBaseEl = document.getElementById('insightBase');
+        const coachNotesEl = document.getElementById('coachNotes');
+        function setStatus(msg, isError = false) {
+            statusEl.textContent = msg;
+            statusEl.classList.toggle('error', isError);
+        }
+        function writeOutput(value) {
+            outEl.textContent = typeof value === 'string' ? value : JSON.stringify(value, null, 2);
+        }
+        function updateEmailPanel(data) {
+            if (!data || !data.observation) {
+                return;
+            }
+            const obs = data.observation;
+            mailSubjectEl.textContent = obs.subject || 'No subject';
+            mailSenderEl.textContent = obs.sender || '-';
+            mailBodyEl.textContent = obs.body || '';
+        }
+        function updateProductionControlsVisibility() {
+            const isProduction = taskIdEl.value === 'task_production';
+            productionControlsEl.style.display = isProduction ? 'block' : 'none';
+        }
+        function safeNumber(value) {
+            return typeof value === 'number' && !Number.isNaN(value) ? value : null;
+        }
+        function updateInsights(data) {
+            const info = (data && data.info) ? data.info : {};
+            const taskValue = info.task_id || data.task_id || (data.observation && data.observation.task_id) || '-';
+            const scenarioValue = info.scenario_id || '-';
+            insightTaskEl.textContent = taskValue;
+            insightScenarioEl.textContent = scenarioValue;
+            const emailsProcessed = safeNumber(info.emails_processed);
+            const emailsTotal = safeNumber(info.emails_total);
+            if (emailsProcessed !== null && emailsTotal !== null) {
+                insightProgressEl.textContent = `${emailsProcessed}/${emailsTotal}`;
+            } else if (safeNumber(data.current_step) !== null && safeNumber(data.total_steps) !== null) {
+                insightProgressEl.textContent = `${data.current_step}/${data.total_steps}`;
+            }
+            const rewardValue = safeNumber(data.reward);
+            insightRewardEl.textContent = rewardValue !== null ? rewardValue.toFixed(6) : '-';
+            const baseScoreValue = safeNumber(info.base_score);
+            insightBaseEl.textContent = baseScoreValue !== null ? baseScoreValue.toFixed(6) : '-';
+            const tips = [];
+            if (info.validation_error) {
+                tips.push('Action format is invalid. Keep label/summary/route_to filled correctly.');
+            }
+            const routeNoise = safeNumber(info.grade_route_noise_penalty);
+            if (routeNoise !== null && routeNoise > 0.01) {
+                tips.push('Route to one best owner team. Avoid sending to many teams at once.');
+            }
+            const summaryMatch = safeNumber(info.grade_summary_match);
+            if (summaryMatch !== null && summaryMatch < 0.6) {
+                tips.push('Summary is weak. Include concrete clues from subject/body/thread.');
+            }
+            const labelMatch = safeNumber(info.grade_label_match);
+            if (labelMatch !== null && labelMatch < 1.0) {
+                tips.push('Priority label may be off. Re-check urgency and risk signals.');
+            }
+            const routeMatch = safeNumber(info.grade_route_match);
+            if (routeMatch !== null && routeMatch < 1.0) {
+                tips.push('Routing looks off. Pick the team that directly owns this issue.');
+            }
+            const urgencyComponent = safeNumber(info.grade_urgency_component);
+            if (urgencyComponent !== null && urgencyComponent < 0.2) {
+                tips.push('For high-risk complaints, mark urgent and route to safety first.');
+            }
+            if (!tips.length && typeof info.grading_feedback === 'string' && info.grading_feedback) {
+                tips.push(info.grading_feedback);
+            }
+            coachNotesEl.textContent = tips.length
+                ? tips.join(' ')
+                : 'Looks good. Keep your next route precise and your summary evidence-based.';
+        }
+        function prefillAction(label, routeTo, summary) {
+            document.getElementById('label').value = label;
+            document.getElementById('routeTo').value = routeTo;
+            document.getElementById('summary').value = summary;
+        }
+        async function postJson(path, payload) {
+            const response = await fetch(path, {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify(payload || {}),
+            });
+            const text = await response.text();
+            let data = text;
+            try { data = JSON.parse(text); } catch (e) {}
+            if (!response.ok) {
+                throw new Error('HTTP ' + response.status + ' - ' + text);
+            }
+            return data;
+        }
+        async function warmup() {
+            try {
+                const res = await fetch('/meta');
+                const data = await res.json();
+                badgeEl.textContent = data.status === 'ok' ? 'ready' : 'check service';
+            } catch (e) {
+                badgeEl.textContent = 'offline';
+            }
+        }
+        document.getElementById('btnReset').addEventListener('click', async () => {
+            const taskId = taskIdEl.value;
+            setStatus('Starting a new scenario...');
+            try {
+                const payload = { task_id: taskId };
+                if (taskId === 'task_production') {
+                    payload.production_profile = document.getElementById('productionProfile').value;
+                    payload.business_hours_mode = document.getElementById('businessHoursMode').value === 'true';
+                    payload.escalation_mode = document.getElementById('escalationMode').value;
+                }
+                const data = await postJson('/reset', payload);
+                setStatus('Scenario started. Read the email below.');
+                updateEmailPanel(data);
+                updateInsights(data);
+                writeOutput(data);
+            } catch (e) {
+                setStatus('Could not start scenario. See details below.', true);
+                writeOutput(String(e));
+            }
+        });
+        document.getElementById('btnState').addEventListener('click', async () => {
+            setStatus('Checking progress...');
+            try {
+                const data = await postJson('/state', {});
+                setStatus('Progress updated.');
+                updateInsights(data);
+                writeOutput(data);
+            } catch (e) {
+                setStatus('Could not fetch progress. See details below.', true);
+                writeOutput(String(e));
+            }
+        });
+        document.getElementById('btnStep').addEventListener('click', async () => {
+            const payload = {
+                label: document.getElementById('label').value,
+                summary: document.getElementById('summary').value,
+                route_to: document.getElementById('routeTo').value,
+            };
+            setStatus('Sending your decision...');
+            try {
+                const data = await postJson('/step', payload);
+                setStatus('Decision saved.');
+                updateEmailPanel(data);
+                updateInsights(data);
+                writeOutput(data);
+            } catch (e) {
+                setStatus('Could not submit decision. See details below.', true);
+                writeOutput(String(e));
+            }
+        });
+        document.getElementById('chipSafety').addEventListener('click', () => {
+            prefillAction('urgent', 'safety', 'Potential safety risk with immediate escalation needed.');
+        });
+        document.getElementById('chipBilling').addEventListener('click', () => {
+            prefillAction('normal', 'billing', 'Customer billing issue needs finance team review and response.');
+        });
+        document.getElementById('chipSpam').addEventListener('click', () => {
+            prefillAction('spam', 'general', 'Likely phishing or irrelevant message with suspicious external request.');
+        });
+        taskIdEl.addEventListener('change', updateProductionControlsVisibility);
+        updateProductionControlsVisibility();
+        warmup();
+    </script>
+</body>
+</html>
+"""
+# Flask application that the route decorators below register against.
+app = Flask(__name__)
+# Module-level environment instance, created on the easy task at import time.
+# reset_endpoint declares it ``global``.
+# NOTE(review): presumably shared by every request handled by this process - confirm.
+current_env = EmailTriageEnv(task_id="task_easy")
+# One integer counter per task id, all starting at 0.
+# NOTE(review): presumably used to rotate scenarios between resets - confirm in reset_endpoint.
+SCENARIO_COUNTERS = {task_id: 0 for task_id in list_task_ids()}
+# Evaluation split reported by /meta; taken from OPENENV_EVAL_SPLIT, default "public".
+DEFAULT_EVAL_SPLIT = os.getenv("OPENENV_EVAL_SPLIT", "public")
+# True only when OPENENV_ALLOW_CLIENT_EVAL_OVERRIDE is "true" (trimmed,
+# case-insensitive). When false, reset_endpoint rejects payloads that contain
+# eval_split or scenario_index.
+ALLOW_CLIENT_EVAL_OVERRIDE = (
+    os.getenv("OPENENV_ALLOW_CLIENT_EVAL_OVERRIDE", "false").strip().lower() == "true"
+)
+@app.get("/")
+def root_page():
+    """Serve the embedded single-page frontend as an HTML response."""
+    page = Response(FRONTEND_HTML, mimetype="text/html")
+    return page
+@app.get("/meta")
+def root_endpoint():
+    """Describe the service as JSON for health checks and machine clients.
+
+    The payload lists the POST endpoints, the public scenario count for each
+    task, the configured evaluation split and the accepted values of the
+    production runtime controls.
+    """
+    endpoints = {
+        "reset": {"method": "POST", "path": "/reset"},
+        "step": {"method": "POST", "path": "/step"},
+        "state": {"method": "POST", "path": "/state"},
+    }
+    # Public scenario-pool size for every known task.
+    public_pool = {}
+    for task_id in list_task_ids():
+        public_pool[task_id] = get_task_scenario_count(task_id, "public")
+    runtime_controls = {
+        "production_profile": ["light", "standard", "heavy"],
+        "business_hours_mode": [True, False],
+        "escalation_mode": ["low", "normal", "high"],
+    }
+    metadata = {
+        "name": "email-triage-env",
+        "status": "ok",
+        "endpoints": endpoints,
+        "scenario_pools": {"public": public_pool},
+        "eval_split": DEFAULT_EVAL_SPLIT,
+        "production_runtime_controls": runtime_controls,
+    }
+    return jsonify(metadata)
def _parse_production_options(payload):
    """Validate the runtime controls accepted for ``task_production``.

    Args:
        payload: Decoded JSON object sent to ``/reset``.

    Returns:
        A ``(options, error)`` pair. On success ``options`` holds the
        validated ``production_profile``, ``business_hours_mode`` and
        ``escalation_mode`` values and ``error`` is ``None``. On failure
        ``options`` is empty and ``error`` is a ``(response, 400)`` tuple
        ready to be returned from the view.
    """
    # Checked in this order so the first reported error stays stable.
    choice_fields = (
        ("production_profile", "standard", ("light", "standard", "heavy")),
        ("escalation_mode", "normal", ("low", "normal", "high")),
    )
    chosen = {}
    for field, default, allowed in choice_fields:
        value = payload.get(field, default)
        if not isinstance(value, str) or value not in allowed:
            choices = "/".join(allowed)
            message = f"Field '{field}' must be one of {choices}."
            return {}, (jsonify({"error": message}), 400)
        chosen[field] = value

    business_hours_mode = payload.get("business_hours_mode", False)
    if isinstance(business_hours_mode, str):
        # String flags are tolerated; anything not listed counts as False.
        business_hours_mode = business_hours_mode.strip().lower() in {
            "1",
            "true",
            "yes",
            "on",
        }
    elif not isinstance(business_hours_mode, bool):
        return {}, (
            jsonify({"error": "Field 'business_hours_mode' must be boolean."}),
            400,
        )

    options = {
        "production_profile": chosen["production_profile"],
        "business_hours_mode": business_hours_mode,
        "escalation_mode": chosen["escalation_mode"],
    }
    return options, None


@app.post("/reset")
def reset_endpoint():
    """Reset the environment with a selected task and return ResetResult JSON.

    The request body is optional. Recognised fields:

    * ``task_id`` (str, default ``"task_easy"``).
    * ``production_profile`` / ``escalation_mode`` / ``business_hours_mode``:
      only read when ``task_id`` is ``"task_production"``.
    * ``eval_split`` / ``scenario_index``: only honoured when
      ``ALLOW_CLIENT_EVAL_OVERRIDE`` is true; otherwise their mere presence
      is rejected.

    When no ``scenario_index`` is supplied, the per-task counter in
    ``SCENARIO_COUNTERS`` picks the scenario and is advanced round-robin
    over the number of scenarios in the selected split.

    Returns:
        Flask response containing the reset payload, or a JSON ``error``
        object with HTTP 400 when the request is invalid or the
        environment raises ``KeyError``.
    """
    global current_env

    payload = request.get_json(silent=True)
    if payload is None:
        # Missing or unparsable body: fall back to all defaults.
        payload = {}
    elif not isinstance(payload, dict):
        return jsonify({"error": "Malformed JSON payload."}), 400

    task_id = payload.get("task_id", "task_easy")
    if not isinstance(task_id, str):
        return jsonify({"error": "Field 'task_id' must be a string."}), 400

    runtime_options: dict[str, object] = {}
    if task_id == "task_production":
        runtime_options, option_error = _parse_production_options(payload)
        if option_error is not None:
            return option_error

    if not ALLOW_CLIENT_EVAL_OVERRIDE and (
        "eval_split" in payload or "scenario_index" in payload
    ):
        return jsonify(
            {
                "error": (
                    "Client overrides for eval_split/scenario_index are disabled "
                    "by server policy."
                )
            }
        ), 400

    eval_split = DEFAULT_EVAL_SPLIT
    requested_index = None
    if ALLOW_CLIENT_EVAL_OVERRIDE:
        eval_split = payload.get("eval_split", DEFAULT_EVAL_SPLIT)
        if not isinstance(eval_split, str):
            return jsonify({"error": "Field 'eval_split' must be a string."}), 400
        requested_index = payload.get("scenario_index")
        # bool is a subclass of int, so JSON true/false must be excluded
        # explicitly or it would be accepted as index 1/0.
        if requested_index is not None and (
            isinstance(requested_index, bool)
            or not isinstance(requested_index, int)
            or requested_index < 0
        ):
            return jsonify(
                {"error": "Field 'scenario_index' must be a non-negative integer."}
            ), 400

    try:
        scenario_count = get_task_scenario_count(task_id, eval_split)
        if requested_index is None:
            scenario_index = SCENARIO_COUNTERS.get(task_id, 0)
            if scenario_count > 0:
                # Rotate so consecutive resets walk through the pool.
                SCENARIO_COUNTERS[task_id] = (scenario_index + 1) % scenario_count
        else:
            scenario_index = requested_index
        current_env = EmailTriageEnv(
            task_id=task_id,
            scenario_index=scenario_index,
            split=eval_split,
            runtime_options=runtime_options,
        )
        reset_result = current_env.reset()
    except KeyError as error:
        return jsonify({"error": str(error)}), 400
    return jsonify(reset_result.model_dump())
@app.post("/step")
def step_endpoint():
    """Apply one action to the active environment and return StepResult JSON.

    The decoded request body is handed to ``current_env.step`` unchanged.

    Returns:
        Flask response containing the step payload, or a JSON ``error``
        object with HTTP 400 when the body is missing or not valid JSON.
    """
    action = request.get_json(silent=True)
    if action is not None:
        # NOTE(review): relies on ``current_env`` already holding an
        # environment instance; confirm how it is initialised at import.
        outcome = current_env.step(action)
        return jsonify(outcome.model_dump())
    return jsonify({"error": "Malformed JSON payload."}), 400
@app.post("/state")
def state_endpoint():
    """Return the active environment's state snapshot as JSON.

    Returns:
        Flask response containing the serialised result of
        ``current_env.state()``.
    """
    # NOTE(review): relies on ``current_env`` already holding an
    # environment instance; confirm how it is initialised at import.
    snapshot = current_env.state().model_dump()
    return jsonify(snapshot)
def main() -> None:
    """Start the Flask server, listening on all interfaces at port 7860."""
    bind_address = {"host": "0.0.0.0", "port": 7860}
    app.run(**bind_address)


# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    main()