Imaginephoenix committed on
Commit
496c5c4
·
verified ·
1 Parent(s): 97c9151

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +775 -0
  2. environment.py +469 -0
  3. graders.py +319 -0
  4. inference.py +384 -0
  5. server.py +775 -0
app.py ADDED
@@ -0,0 +1,775 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Auxiliary server entrypoint required by OpenEnv local validation checks."""
2
+
3
+ import os
4
+
5
+ from flask import Flask, Response, jsonify, request
6
+
7
+ from environment import EmailTriageEnv
8
+ from tasks import get_task_scenario_count, list_task_ids
9
+
10
+ FRONTEND_HTML = """<!doctype html>
11
+ <html lang="en">
12
+ <head>
13
+ <meta charset="utf-8" />
14
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
15
+ <title>Inbox Helper Practice</title>
16
+ <style>
17
+ @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
18
+
19
+ :root {
20
+ --bg: #f5f1e9;
21
+ --paper: #fffaf2;
22
+ --ink: #102433;
23
+ --accent: #ea6a2a;
24
+ --accent-soft: #ffd6bf;
25
+ --line: #d7cabb;
26
+ --ok: #0f7b6c;
27
+ --warn: #9a3a12;
28
+ --radius: 14px;
29
+ }
30
+
31
+ * { box-sizing: border-box; }
32
+
33
+ body {
34
+ margin: 0;
35
+ font-family: 'Space Grotesk', sans-serif;
36
+ color: var(--ink);
37
+ background:
38
+ radial-gradient(1100px 460px at -10% -20%, #f2bc9f 0%, transparent 60%),
39
+ radial-gradient(1100px 520px at 120% 115%, #b8d7cf 0%, transparent 62%),
40
+ var(--bg);
41
+ min-height: 100vh;
42
+ }
43
+
44
+ .wrap {
45
+ max-width: 1100px;
46
+ margin: 28px auto;
47
+ padding: 0 16px;
48
+ animation: reveal .45s ease-out;
49
+ }
50
+
51
+ @keyframes reveal {
52
+ from { opacity: 0; transform: translateY(10px); }
53
+ to { opacity: 1; transform: translateY(0); }
54
+ }
55
+
56
+ .title {
57
+ display: flex;
58
+ justify-content: space-between;
59
+ align-items: baseline;
60
+ gap: 14px;
61
+ margin-bottom: 14px;
62
+ }
63
+
64
+ h1 {
65
+ margin: 0;
66
+ font-size: clamp(1.5rem, 2vw, 2.2rem);
67
+ letter-spacing: .4px;
68
+ }
69
+
70
+ .subtitle {
71
+ margin: 6px 0 0;
72
+ font-size: .95rem;
73
+ opacity: .8;
74
+ }
75
+
76
+ .badge {
77
+ background: var(--accent-soft);
78
+ border: 1px solid #f2b693;
79
+ color: #7f2e0b;
80
+ padding: 6px 10px;
81
+ border-radius: 999px;
82
+ font-size: .85rem;
83
+ font-weight: 600;
84
+ }
85
+
86
+ .grid {
87
+ display: grid;
88
+ grid-template-columns: 1fr;
89
+ gap: 14px;
90
+ }
91
+
92
+ @media (min-width: 900px) {
93
+ .grid { grid-template-columns: 1fr 1fr; }
94
+ .wide { grid-column: span 2; }
95
+ }
96
+
97
+ .card {
98
+ background: var(--paper);
99
+ border: 1px solid var(--line);
100
+ border-radius: var(--radius);
101
+ padding: 14px;
102
+ box-shadow: 0 8px 28px rgba(16, 36, 51, 0.08);
103
+ }
104
+
105
+ .card h2 {
106
+ margin: 0 0 10px;
107
+ font-size: 1rem;
108
+ text-transform: uppercase;
109
+ letter-spacing: .08em;
110
+ opacity: .86;
111
+ }
112
+
113
+ .row {
114
+ display: flex;
115
+ flex-wrap: wrap;
116
+ gap: 8px;
117
+ align-items: center;
118
+ margin-bottom: 10px;
119
+ }
120
+
121
+ select, input, textarea, button {
122
+ font-family: inherit;
123
+ font-size: .95rem;
124
+ }
125
+
126
+ select, input, textarea {
127
+ width: 100%;
128
+ border: 1px solid #cdbba6;
129
+ border-radius: 10px;
130
+ padding: 9px 10px;
131
+ background: #fff;
132
+ color: var(--ink);
133
+ }
134
+
135
+ textarea {
136
+ min-height: 92px;
137
+ resize: vertical;
138
+ }
139
+
140
+ button {
141
+ border: 0;
142
+ border-radius: 10px;
143
+ padding: 9px 12px;
144
+ font-weight: 700;
145
+ background: var(--ink);
146
+ color: #fff;
147
+ cursor: pointer;
148
+ transition: transform .12s ease, opacity .12s ease;
149
+ }
150
+
151
+ button.secondary {
152
+ background: #285066;
153
+ }
154
+
155
+ button.accent {
156
+ background: var(--accent);
157
+ }
158
+
159
+ button:hover { transform: translateY(-1px); }
160
+ button:active { transform: translateY(0); opacity: .92; }
161
+
162
+ .status {
163
+ padding: 8px 10px;
164
+ border-radius: 10px;
165
+ background: #eef7f5;
166
+ border: 1px solid #c7e4de;
167
+ color: var(--ok);
168
+ font-weight: 600;
169
+ min-height: 40px;
170
+ display: flex;
171
+ align-items: center;
172
+ }
173
+
174
+ .status.error {
175
+ background: #fff1ea;
176
+ border-color: #ffc8ae;
177
+ color: var(--warn);
178
+ }
179
+
180
+ pre {
181
+ margin: 0;
182
+ white-space: pre-wrap;
183
+ background: #0f1b24;
184
+ color: #d9efe9;
185
+ border-radius: 10px;
186
+ padding: 12px;
187
+ max-height: 340px;
188
+ overflow: auto;
189
+ font-family: 'IBM Plex Mono', monospace;
190
+ font-size: .85rem;
191
+ border: 1px solid #21313f;
192
+ }
193
+
194
+ .email-block {
195
+ background: #fff;
196
+ border: 1px solid #d9ccbc;
197
+ border-radius: 10px;
198
+ padding: 12px;
199
+ }
200
+
201
+ .email-row {
202
+ margin-bottom: 8px;
203
+ font-size: .95rem;
204
+ line-height: 1.35;
205
+ }
206
+
207
+ .email-row strong {
208
+ display: inline-block;
209
+ min-width: 66px;
210
+ }
211
+
212
+ .help {
213
+ margin: 0 0 10px;
214
+ font-size: .9rem;
215
+ opacity: .8;
216
+ }
217
+
218
+ .metric {
219
+ display: flex;
220
+ justify-content: space-between;
221
+ align-items: center;
222
+ margin-bottom: 8px;
223
+ padding-bottom: 6px;
224
+ border-bottom: 1px dashed #dbcfbe;
225
+ font-size: .95rem;
226
+ }
227
+
228
+ .metric strong {
229
+ font-weight: 700;
230
+ }
231
+
232
+ .coach {
233
+ background: #fff7ed;
234
+ border: 1px solid #f2caa9;
235
+ border-radius: 10px;
236
+ padding: 10px;
237
+ min-height: 74px;
238
+ line-height: 1.4;
239
+ font-size: .92rem;
240
+ }
241
+
242
+ .chip-row {
243
+ display: flex;
244
+ flex-wrap: wrap;
245
+ gap: 8px;
246
+ margin-top: 10px;
247
+ }
248
+
249
+ .chip {
250
+ background: #eaf3ff;
251
+ border: 1px solid #b9d1ef;
252
+ color: #184469;
253
+ border-radius: 999px;
254
+ padding: 6px 10px;
255
+ font-size: .84rem;
256
+ cursor: pointer;
257
+ font-weight: 600;
258
+ }
259
+ </style>
260
+ </head>
261
+ <body>
262
+ <div class="wrap">
263
+ <div class="title">
264
+ <div>
265
+ <h1>Inbox Helper Practice</h1>
266
+ <p class="subtitle">Practice deciding priority, category, and who should handle each email.</p>
267
+ </div>
268
+ <span class="badge" id="badge">connecting...</span>
269
+ </div>
270
+
271
+ <div class="grid">
272
+ <section class="card">
273
+ <h2>Start a Scenario</h2>
274
+ <p class="help">Pick a difficulty, then click Start.</p>
275
+ <div class="row">
276
+ <select id="taskId">
277
+ <option value="task_easy">Easy: one clear email</option>
278
+ <option value="task_medium">Medium: mixed inbox</option>
279
+ <option value="task_hard">Hard: high-risk complaint</option>
280
+ <option value="task_production">Production: full inbox simulator</option>
281
+ </select>
282
+ </div>
283
+ <div id="productionControls" style="display:none;">
284
+ <div class="row">
285
+ <select id="productionProfile">
286
+ <option value="light">Workload: Light</option>
287
+ <option value="standard" selected>Workload: Standard</option>
288
+ <option value="heavy">Workload: Heavy</option>
289
+ </select>
290
+ </div>
291
+ <div class="row">
292
+ <select id="businessHoursMode">
293
+ <option value="false" selected>Time Profile: 24x7 inbox</option>
294
+ <option value="true">Time Profile: business hours focus</option>
295
+ </select>
296
+ </div>
297
+ <div class="row">
298
+ <select id="escalationMode">
299
+ <option value="low">Escalation: Low</option>
300
+ <option value="normal" selected>Escalation: Normal</option>
301
+ <option value="high">Escalation: High</option>
302
+ </select>
303
+ </div>
304
+ </div>
305
+ <div class="row">
306
+ <button class="accent" id="btnReset">Start</button>
307
+ <button class="secondary" id="btnState">Check Progress</button>
308
+ </div>
309
+ <div class="status" id="status">Ready. Start a scenario.</div>
310
+ </section>
311
+
312
+ <section class="card">
313
+ <h2>Your Decision</h2>
314
+ <p class="help">Choose priority, who should handle it, and a short reason.</p>
315
+ <div class="row">
316
+ <select id="label">
317
+ <option value="urgent">Urgent</option>
318
+ <option value="normal" selected>Normal</option>
319
+ <option value="spam">Spam</option>
320
+ <option value="archive">Archive</option>
321
+ </select>
322
+ </div>
323
+ <div class="row">
324
+ <input id="routeTo" placeholder="Who should handle this? (billing, safety, engineering, support)" value="general" />
325
+ </div>
326
+ <div class="row">
327
+ <textarea id="summary" placeholder="Write one clear sentence with key clues from the email.">Needs review.</textarea>
328
+ </div>
329
+ <div class="row">
330
+ <button id="btnStep">Send Decision</button>
331
+ </div>
332
+ </section>
333
+
334
+ <section class="card wide">
335
+ <h2>Current Email</h2>
336
+ <div class="email-block">
337
+ <div class="email-row"><strong>Subject:</strong> <span id="mailSubject">No email loaded yet.</span></div>
338
+ <div class="email-row"><strong>From:</strong> <span id="mailSender">-</span></div>
339
+ <div class="email-row"><strong>Message:</strong> <span id="mailBody">Start a scenario to load an email.</span></div>
340
+ </div>
341
+ </section>
342
+
343
+ <section class="card">
344
+ <h2>Live Progress</h2>
345
+ <div class="metric"><span>Task</span><strong id="insightTask">-</strong></div>
346
+ <div class="metric"><span>Scenario</span><strong id="insightScenario">-</strong></div>
347
+ <div class="metric"><span>Progress</span><strong id="insightProgress">0/0</strong></div>
348
+ <div class="metric"><span>Last Reward</span><strong id="insightReward">-</strong></div>
349
+ <div class="metric"><span>Base Score</span><strong id="insightBase">-</strong></div>
350
+ </section>
351
+
352
+ <section class="card">
353
+ <h2>Coach Notes</h2>
354
+ <p class="help">Use this to improve your next triage action.</p>
355
+ <div class="coach" id="coachNotes">Start a scenario and submit one decision to get feedback.</div>
356
+ <div class="chip-row">
357
+ <button class="chip" id="chipSafety">Quick Fill: Urgent + Safety</button>
358
+ <button class="chip" id="chipBilling">Quick Fill: Normal + Billing</button>
359
+ <button class="chip" id="chipSpam">Quick Fill: Spam + General</button>
360
+ </div>
361
+ </section>
362
+
363
+ <section class="card wide">
364
+ <h2>Details (Advanced)</h2>
365
+ <pre id="output">Waiting for your first action...</pre>
366
+ </section>
367
+ </div>
368
+ </div>
369
+
370
+ <script>
371
+ const statusEl = document.getElementById('status');
372
+ const badgeEl = document.getElementById('badge');
373
+ const outEl = document.getElementById('output');
374
+ const mailSubjectEl = document.getElementById('mailSubject');
375
+ const mailSenderEl = document.getElementById('mailSender');
376
+ const mailBodyEl = document.getElementById('mailBody');
377
+ const taskIdEl = document.getElementById('taskId');
378
+ const productionControlsEl = document.getElementById('productionControls');
379
+ const insightTaskEl = document.getElementById('insightTask');
380
+ const insightScenarioEl = document.getElementById('insightScenario');
381
+ const insightProgressEl = document.getElementById('insightProgress');
382
+ const insightRewardEl = document.getElementById('insightReward');
383
+ const insightBaseEl = document.getElementById('insightBase');
384
+ const coachNotesEl = document.getElementById('coachNotes');
385
+
386
+ function setStatus(msg, isError = false) {
387
+ statusEl.textContent = msg;
388
+ statusEl.classList.toggle('error', isError);
389
+ }
390
+
391
+ function writeOutput(value) {
392
+ outEl.textContent = typeof value === 'string' ? value : JSON.stringify(value, null, 2);
393
+ }
394
+
395
+ function updateEmailPanel(data) {
396
+ if (!data || !data.observation) {
397
+ return;
398
+ }
399
+ const obs = data.observation;
400
+ mailSubjectEl.textContent = obs.subject || 'No subject';
401
+ mailSenderEl.textContent = obs.sender || '-';
402
+ mailBodyEl.textContent = obs.body || '';
403
+ }
404
+
405
+ function updateProductionControlsVisibility() {
406
+ const isProduction = taskIdEl.value === 'task_production';
407
+ productionControlsEl.style.display = isProduction ? 'block' : 'none';
408
+ }
409
+
410
+ function safeNumber(value) {
411
+ return typeof value === 'number' && !Number.isNaN(value) ? value : null;
412
+ }
413
+
414
+ function updateInsights(data) {
415
+ const info = (data && data.info) ? data.info : {};
416
+ const taskValue = info.task_id || data.task_id || (data.observation && data.observation.task_id) || '-';
417
+ const scenarioValue = info.scenario_id || '-';
418
+
419
+ insightTaskEl.textContent = taskValue;
420
+ insightScenarioEl.textContent = scenarioValue;
421
+
422
+ const emailsProcessed = safeNumber(info.emails_processed);
423
+ const emailsTotal = safeNumber(info.emails_total);
424
+ if (emailsProcessed !== null && emailsTotal !== null) {
425
+ insightProgressEl.textContent = `${emailsProcessed}/${emailsTotal}`;
426
+ } else if (safeNumber(data.current_step) !== null && safeNumber(data.total_steps) !== null) {
427
+ insightProgressEl.textContent = `${data.current_step}/${data.total_steps}`;
428
+ }
429
+
430
+ const rewardValue = safeNumber(data.reward);
431
+ insightRewardEl.textContent = rewardValue !== null ? rewardValue.toFixed(6) : '-';
432
+
433
+ const baseScoreValue = safeNumber(info.base_score);
434
+ insightBaseEl.textContent = baseScoreValue !== null ? baseScoreValue.toFixed(6) : '-';
435
+
436
+ const tips = [];
437
+ if (info.validation_error) {
438
+ tips.push('Action format is invalid. Keep label/summary/route_to filled correctly.');
439
+ }
440
+
441
+ const routeNoise = safeNumber(info.grade_route_noise_penalty);
442
+ if (routeNoise !== null && routeNoise > 0.01) {
443
+ tips.push('Route to one best owner team. Avoid sending to many teams at once.');
444
+ }
445
+
446
+ const summaryMatch = safeNumber(info.grade_summary_match);
447
+ if (summaryMatch !== null && summaryMatch < 0.6) {
448
+ tips.push('Summary is weak. Include concrete clues from subject/body/thread.');
449
+ }
450
+
451
+ const labelMatch = safeNumber(info.grade_label_match);
452
+ if (labelMatch !== null && labelMatch < 1.0) {
453
+ tips.push('Priority label may be off. Re-check urgency and risk signals.');
454
+ }
455
+
456
+ const routeMatch = safeNumber(info.grade_route_match);
457
+ if (routeMatch !== null && routeMatch < 1.0) {
458
+ tips.push('Routing looks off. Pick the team that directly owns this issue.');
459
+ }
460
+
461
+ const urgencyComponent = safeNumber(info.grade_urgency_component);
462
+ if (urgencyComponent !== null && urgencyComponent < 0.2) {
463
+ tips.push('For high-risk complaints, mark urgent and route to safety first.');
464
+ }
465
+
466
+ if (!tips.length && typeof info.grading_feedback === 'string' && info.grading_feedback) {
467
+ tips.push(info.grading_feedback);
468
+ }
469
+
470
+ coachNotesEl.textContent = tips.length
471
+ ? tips.join(' ')
472
+ : 'Looks good. Keep your next route precise and your summary evidence-based.';
473
+ }
474
+
475
+ function prefillAction(label, routeTo, summary) {
476
+ document.getElementById('label').value = label;
477
+ document.getElementById('routeTo').value = routeTo;
478
+ document.getElementById('summary').value = summary;
479
+ }
480
+
481
+ async function postJson(path, payload) {
482
+ const response = await fetch(path, {
483
+ method: 'POST',
484
+ headers: { 'Content-Type': 'application/json' },
485
+ body: JSON.stringify(payload || {}),
486
+ });
487
+ const text = await response.text();
488
+ let data = text;
489
+ try { data = JSON.parse(text); } catch (e) {}
490
+ if (!response.ok) {
491
+ throw new Error('HTTP ' + response.status + ' - ' + text);
492
+ }
493
+ return data;
494
+ }
495
+
496
+ async function warmup() {
497
+ try {
498
+ const res = await fetch('/meta');
499
+ const data = await res.json();
500
+ badgeEl.textContent = data.status === 'ok' ? 'ready' : 'check service';
501
+ } catch (e) {
502
+ badgeEl.textContent = 'offline';
503
+ }
504
+ }
505
+
506
+ document.getElementById('btnReset').addEventListener('click', async () => {
507
+ const taskId = taskIdEl.value;
508
+ setStatus('Starting a new scenario...');
509
+ try {
510
+ const payload = { task_id: taskId };
511
+ if (taskId === 'task_production') {
512
+ payload.production_profile = document.getElementById('productionProfile').value;
513
+ payload.business_hours_mode = document.getElementById('businessHoursMode').value === 'true';
514
+ payload.escalation_mode = document.getElementById('escalationMode').value;
515
+ }
516
+ const data = await postJson('/reset', payload);
517
+ setStatus('Scenario started. Read the email below.');
518
+ updateEmailPanel(data);
519
+ updateInsights(data);
520
+ writeOutput(data);
521
+ } catch (e) {
522
+ setStatus('Could not start scenario. See details below.', true);
523
+ writeOutput(String(e));
524
+ }
525
+ });
526
+
527
+ document.getElementById('btnState').addEventListener('click', async () => {
528
+ setStatus('Checking progress...');
529
+ try {
530
+ const data = await postJson('/state', {});
531
+ setStatus('Progress updated.');
532
+ updateInsights(data);
533
+ writeOutput(data);
534
+ } catch (e) {
535
+ setStatus('Could not fetch progress. See details below.', true);
536
+ writeOutput(String(e));
537
+ }
538
+ });
539
+
540
+ document.getElementById('btnStep').addEventListener('click', async () => {
541
+ const payload = {
542
+ label: document.getElementById('label').value,
543
+ summary: document.getElementById('summary').value,
544
+ route_to: document.getElementById('routeTo').value,
545
+ };
546
+ setStatus('Sending your decision...');
547
+ try {
548
+ const data = await postJson('/step', payload);
549
+ setStatus('Decision saved.');
550
+ updateEmailPanel(data);
551
+ updateInsights(data);
552
+ writeOutput(data);
553
+ } catch (e) {
554
+ setStatus('Could not submit decision. See details below.', true);
555
+ writeOutput(String(e));
556
+ }
557
+ });
558
+
559
+ document.getElementById('chipSafety').addEventListener('click', () => {
560
+ prefillAction('urgent', 'safety', 'Potential safety risk with immediate escalation needed.');
561
+ });
562
+
563
+ document.getElementById('chipBilling').addEventListener('click', () => {
564
+ prefillAction('normal', 'billing', 'Customer billing issue needs finance team review and response.');
565
+ });
566
+
567
+ document.getElementById('chipSpam').addEventListener('click', () => {
568
+ prefillAction('spam', 'general', 'Likely phishing or irrelevant message with suspicious external request.');
569
+ });
570
+
571
+ taskIdEl.addEventListener('change', updateProductionControlsVisibility);
572
+
573
+ updateProductionControlsVisibility();
574
+ warmup();
575
+ </script>
576
+ </body>
577
+ </html>
578
+ """
579
+
580
# Flask application object that the route decorators below attach to.
app = Flask(__name__)

# Environment instance shared by every request; /reset rebinds it.
current_env = EmailTriageEnv(task_id="task_easy")

# Per-task index of the next scenario to serve when the client does not pick one.
SCENARIO_COUNTERS = dict.fromkeys(list_task_ids(), 0)

# Scenario split used unless a client override is permitted and supplied.
DEFAULT_EVAL_SPLIT = os.environ.get("OPENENV_EVAL_SPLIT", "public")

# Only the literal value "true" (case-insensitive, surrounding whitespace
# ignored) enables client-side eval_split/scenario_index overrides.
ALLOW_CLIENT_EVAL_OVERRIDE = (
    os.environ.get("OPENENV_ALLOW_CLIENT_EVAL_OVERRIDE", "false").strip().lower()
    == "true"
)
587
+
588
+
589
@app.get("/")
def root_page():
    """Serve the embedded single-page frontend as an HTML response."""
    page = Response(FRONTEND_HTML, mimetype="text/html")
    return page
593
+
594
+
595
@app.get("/meta")
def root_endpoint():
    """Describe the service: status, endpoints, scenario pools, and runtime controls.

    Returns:
        Flask JSON response with the metadata document.
    """
    # Number of public scenarios available for each known task.
    public_pool = {}
    for task_id in list_task_ids():
        public_pool[task_id] = get_task_scenario_count(task_id, "public")

    endpoints = {
        "reset": {"method": "POST", "path": "/reset"},
        "step": {"method": "POST", "path": "/step"},
        "state": {"method": "POST", "path": "/state"},
    }
    runtime_controls = {
        "production_profile": ["light", "standard", "heavy"],
        "business_hours_mode": [True, False],
        "escalation_mode": ["low", "normal", "high"],
    }

    metadata = {
        "name": "email-triage-env",
        "status": "ok",
        "endpoints": endpoints,
        "scenario_pools": {"public": public_pool},
        "eval_split": DEFAULT_EVAL_SPLIT,
        "production_runtime_controls": runtime_controls,
    }
    return jsonify(metadata)
621
+
622
+
623
def _parse_production_options(
    payload: dict[str, object],
) -> tuple[dict[str, object] | None, str | None]:
    """Validate the production-only runtime controls found in a reset payload.

    Args:
        payload: Decoded JSON object sent to /reset.

    Returns:
        ``(options, None)`` when every control is valid, otherwise
        ``(None, message)`` where ``message`` is the client-facing error text.
    """
    production_profile = payload.get("production_profile", "standard")
    # The isinstance guard also keeps unhashable JSON values (lists, objects)
    # away from the set membership test, which would raise TypeError.
    if not isinstance(production_profile, str) or production_profile not in {
        "light",
        "standard",
        "heavy",
    }:
        return None, "Field 'production_profile' must be one of light/standard/heavy."

    escalation_mode = payload.get("escalation_mode", "normal")
    if not isinstance(escalation_mode, str) or escalation_mode not in {
        "low",
        "normal",
        "high",
    }:
        return None, "Field 'escalation_mode' must be one of low/normal/high."

    business_hours_mode = payload.get("business_hours_mode", False)
    if isinstance(business_hours_mode, str):
        # Accept common truthy spellings; any other string counts as False.
        business_hours_mode = business_hours_mode.strip().lower() in {
            "1",
            "true",
            "yes",
            "on",
        }
    elif not isinstance(business_hours_mode, bool):
        return None, "Field 'business_hours_mode' must be boolean."

    options: dict[str, object] = {
        "production_profile": production_profile,
        "business_hours_mode": business_hours_mode,
        "escalation_mode": escalation_mode,
    }
    return options, None


@app.post("/reset")
def reset_endpoint():
    """Reset the environment with a selected task and return ResetResult JSON.

    The request body is optional. When present it must be a JSON object and may
    carry ``task_id`` plus, for ``task_production``, the runtime controls
    ``production_profile``, ``business_hours_mode`` and ``escalation_mode``.
    ``eval_split`` and ``scenario_index`` are honoured only when
    ``ALLOW_CLIENT_EVAL_OVERRIDE`` is enabled; otherwise sending them is an error.

    Returns:
        Flask response containing reset payload, or a 400 JSON error response
        when the payload is invalid or a ``KeyError`` is raised while loading
        the task.
    """
    # Only current_env is rebound here; SCENARIO_COUNTERS is mutated in place,
    # so it needs no global declaration.
    global current_env

    payload = request.get_json(silent=True)
    if payload is None:
        payload = {}
    elif not isinstance(payload, dict):
        return jsonify({"error": "Malformed JSON payload."}), 400

    task_id = payload.get("task_id", "task_easy")
    if not isinstance(task_id, str):
        return jsonify({"error": "Field 'task_id' must be a string."}), 400

    runtime_options: dict[str, object] = {}
    if task_id == "task_production":
        parsed_options, options_error = _parse_production_options(payload)
        if parsed_options is None:
            return jsonify({"error": options_error}), 400
        runtime_options = parsed_options

    if not ALLOW_CLIENT_EVAL_OVERRIDE and (
        "eval_split" in payload or "scenario_index" in payload
    ):
        return jsonify(
            {
                "error": (
                    "Client overrides for eval_split/scenario_index are disabled "
                    "by server policy."
                )
            }
        ), 400

    eval_split = DEFAULT_EVAL_SPLIT
    requested_index = None
    if ALLOW_CLIENT_EVAL_OVERRIDE:
        requested_split = payload.get("eval_split", DEFAULT_EVAL_SPLIT)
        if not isinstance(requested_split, str):
            return jsonify({"error": "Field 'eval_split' must be a string."}), 400
        eval_split = requested_split
        requested_index = payload.get("scenario_index")

    # bool is a subclass of int, so JSON true/false must be rejected explicitly;
    # otherwise `"scenario_index": true` would silently select index 1.
    if requested_index is not None and (
        isinstance(requested_index, bool)
        or not isinstance(requested_index, int)
        or requested_index < 0
    ):
        return jsonify(
            {"error": "Field 'scenario_index' must be a non-negative integer."}
        ), 400

    try:
        scenario_count = get_task_scenario_count(task_id, eval_split)
        if requested_index is None:
            # Serve scenarios round-robin per task when the client does not choose.
            scenario_index = SCENARIO_COUNTERS.get(task_id, 0)
            if scenario_count > 0:
                SCENARIO_COUNTERS[task_id] = (scenario_index + 1) % scenario_count
        else:
            scenario_index = requested_index

        current_env = EmailTriageEnv(
            task_id=task_id,
            scenario_index=scenario_index,
            split=eval_split,
            runtime_options=runtime_options,
        )
        reset_result = current_env.reset()
    except KeyError as error:
        return jsonify({"error": str(error)}), 400

    return jsonify(reset_result.model_dump())
741
+
742
+
743
@app.post("/step")
def step_endpoint():
    """Feed one client action to the current environment and return StepResult JSON.

    Returns:
        Flask response containing step payload, or a 400 JSON error when the
        request body cannot be decoded as JSON.
    """
    action_payload = request.get_json(silent=True)
    if action_payload is not None:
        outcome = current_env.step(action_payload)
        return jsonify(outcome.model_dump())
    return jsonify({"error": "Malformed JSON payload."}), 400
756
+
757
+
758
@app.post("/state")
def state_endpoint():
    """Return the current environment's state snapshot as JSON.

    Returns:
        Flask response containing state payload.
    """
    return jsonify(current_env.state().model_dump())
767
+
768
+
769
def main() -> None:
    """Start the Flask app on all interfaces at port 7860."""
    bind_host, bind_port = "0.0.0.0", 7860
    app.run(host=bind_host, port=bind_port)


if __name__ == "__main__":
    main()
environment.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Core OpenEnv email triage environment implementation."""
2
+
3
+ import os
4
+ from typing import cast
5
+
6
+ from pydantic import ValidationError
7
+
8
+ from graders import SCORE_EPSILON, grade_easy, grade_hard, grade_medium_step
9
+ from models import (
10
+ EmailObservation,
11
+ EnvironmentState,
12
+ ResetResult,
13
+ RewardResult,
14
+ StepResult,
15
+ TriageAction,
16
+ )
17
+ from tasks import get_task_definition
18
+
19
+
20
+ class EmailTriageEnv:
21
+ """Deterministic email triage environment implementing reset, step, and state."""
22
+
23
def __init__(
    self,
    task_id: str,
    scenario_index: int = 0,
    split: str | None = None,
    runtime_options: dict[str, object] | None = None,
) -> None:
    """Load the selected task scenario and zero the episode bookkeeping.

    Args:
        task_id: Task identifier such as task_easy, task_medium, or task_hard.
        scenario_index: Scenario index within the task pool; negative values
            are clamped to 0.
        split: Scenario split; when empty or None the OPENENV_EVAL_SPLIT
            environment variable is used, defaulting to "public".
        runtime_options: Optional runtime controls forwarded to task generation.
    """
    self.task_id = task_id
    self._episode_index = max(0, scenario_index)
    self.split = split if split else os.getenv("OPENENV_EVAL_SPLIT", "public")
    self.runtime_options = runtime_options if runtime_options else {}

    definition = get_task_definition(
        task_id,
        self._episode_index,
        self.split,
        self.runtime_options,
    )
    self._task_definition = definition
    self._scenario_id = str(definition.get("scenario_id", "unknown"))
    self._emails = cast(list[dict[str, object]], definition.get("emails", []))
    self._ground_truth = cast(
        list[dict[str, object]], definition.get("ground_truth", [])
    )

    # Episode progress markers.
    self._current_index = 0
    self._current_step = 0
    self._done = False
    # Step budget: five more than the number of emails, never below ten.
    self._max_steps = max(10, len(self._emails) + 5)

    # Per-step histories.
    self._action_history: list[TriageAction] = []
    self._reward_history: list[float] = []
    self._base_score_history: list[float] = []

    # Follow-up generation settings.
    self._generated_followups = 0
    self._max_generated_followups = 4
    self._followup_quality_threshold = 0.7

    self._configure_runtime_controls()
65
+
66
def reset(self) -> ResetResult:
    """Reload the task scenario, clear episode state, and emit the first observation.

    The scenario is fetched using the current episode index, after which the
    index is advanced by one so that the next reset requests the following index.

    Returns:
        ResetResult containing first observation and metadata.
    """
    definition = get_task_definition(
        self.task_id,
        self._episode_index,
        self.split,
        self.runtime_options,
    )
    self._task_definition = definition
    self._scenario_id = str(definition.get("scenario_id", "unknown"))
    self._emails = cast(list[dict[str, object]], definition.get("emails", []))
    self._ground_truth = cast(
        list[dict[str, object]], definition.get("ground_truth", [])
    )

    # Clear progress markers and histories for the new episode.
    self._current_index = 0
    self._current_step = 0
    self._done = False
    self._max_steps = max(10, len(self._emails) + 5)
    self._action_history = []
    self._reward_history = []
    self._base_score_history = []
    self._generated_followups = 0
    self._configure_runtime_controls()
    self._episode_index += 1

    first_observation = self._build_observation(self._current_index)
    reset_info = {
        "task_id": self.task_id,
        "scenario_id": self._scenario_id,
        "split": self.split,
        "step": self._current_step,
        "emails_total": len(self._emails),
        "task_description": str(self._task_definition.get("description", "")),
    }
    return ResetResult(observation=first_observation, info=reset_info)
107
+
108
def step(self, action: TriageAction) -> StepResult:
    """Apply one triage action to the current email and advance the episode.

    Args:
        action: Proposed triage action; it is re-validated with
            ``TriageAction.model_validate`` before grading.

    Returns:
        StepResult with the next observation, the clipped reward, the done flag,
        and an ``info`` dict of progress counters and reward components.
    """
    # Stepping a finished episode is a no-op that keeps returning the terminal state.
    if self._done:
        return StepResult(
            observation=self._terminal_observation(),
            reward=SCORE_EPSILON,
            done=True,
            info={
                "task_id": self.task_id,
                "scenario_id": self._scenario_id,
                "split": self.split,
                "step": self._current_step,
                "already_done": True,
            },
        )

    try:
        validated_action = TriageAction.model_validate(action)
    except ValidationError as validation_error:
        # An invalid action consumes a step and earns the minimum reward, but the
        # email index is not advanced, so the same email is presented again.
        self._current_step += 1
        self._reward_history.append(SCORE_EPSILON)
        self._done = self._current_step >= self._max_steps
        return StepResult(
            observation=self._build_observation(self._current_index),
            reward=SCORE_EPSILON,
            done=self._done,
            info={
                "task_id": self.task_id,
                "scenario_id": self._scenario_id,
                "split": self.split,
                "step": self._current_step,
                "emails_total": len(self._emails),
                "emails_processed": self._current_index,
                "emails_remaining": max(len(self._emails) - self._current_index, 0),
                "validation_error": str(validation_error),
            },
        )

    base_result = self._grade_current_step(validated_action)
    base_score = base_result.score
    previous_base_score = self._base_score_history[-1] if self._base_score_history else None
    progress_signal = self._compute_progress_signal(base_score, previous_base_score)

    truth_for_step = (
        self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
        if self._ground_truth
        else {}
    )
    self._maybe_enqueue_follow_up(validated_action, truth_for_step, base_score)

    # Penalties must be computed BEFORE the action is recorded: the repetition
    # check compares the incoming action with the last two history entries, so
    # appending first would make it compare the action with itself and fire
    # after only two identical actions instead of three.
    penalties = self._compute_penalties(validated_action)

    self._action_history.append(validated_action)
    self._base_score_history.append(base_score)
    self._current_step += 1

    trajectory_bonus = self._compute_trajectory_bonus()
    step_cost = self._compute_step_cost()
    final_reward = self._clip_reward(
        base_score + progress_signal + trajectory_bonus - penalties - step_cost
    )

    self._reward_history.append(final_reward)

    if self._current_index < len(self._emails):
        self._current_index += 1

    all_emails_processed = self._current_index >= len(self._emails)
    self._done = all_emails_processed or self._current_step >= self._max_steps

    next_observation = (
        self._terminal_observation()
        if self._done
        else self._build_observation(self._current_index)
    )

    info = {
        "task_id": self.task_id,
        "scenario_id": self._scenario_id,
        "split": self.split,
        "step": self._current_step,
        "emails_total": len(self._emails),
        "emails_processed": min(self._current_index, len(self._emails)),
        "emails_remaining": max(len(self._emails) - self._current_index, 0),
        "base_score": float(base_score),
        "progress_signal": float(progress_signal),
        "step_cost": float(step_cost),
        "penalties": float(penalties),
        "trajectory_bonus": float(trajectory_bonus),
        "grading_feedback": base_result.feedback,
    }
    # Surface the numeric grader components as flat ``grade_*`` entries.
    for breakdown_key, breakdown_value in base_result.breakdown.items():
        if isinstance(breakdown_value, (int, float)):
            info[f"grade_{breakdown_key}"] = float(breakdown_value)

    return StepResult(
        observation=next_observation,
        reward=final_reward,
        done=self._done,
        info=info,
    )
215
+
216
def _maybe_enqueue_follow_up(
    self,
    action: TriageAction,
    truth: dict[str, object],
    base_score: float,
) -> None:
    """Queue an escalation follow-up email when production triage fell short.

    Does nothing unless the task is ``task_production``, the follow-up cap has
    not been reached and the inbox is non-empty. A follow-up is inserted right
    after the current email when an urgent email was mislabeled or misrouted,
    or when the base score is below the configured quality threshold.

    Args:
        action: Validated action taken on the current email.
        truth: Ground-truth entry for the current email.
        base_score: Grader score for the current step.
    """
    if (
        self.task_id != "task_production"
        or self._generated_followups >= self._max_generated_followups
        or not self._emails
    ):
        return

    wanted_label = str(truth.get("label", ""))
    wanted_route = str(truth.get("route_to", "general"))

    urgent_was_missed = False
    if wanted_label == "urgent":
        urgent_was_missed = (
            action.label != "urgent" or wanted_route not in action.route_to.lower()
        )

    quality_is_acceptable = base_score >= self._followup_quality_threshold
    if quality_is_acceptable and not urgent_was_missed:
        return

    origin = self._emails[min(self._current_index, len(self._emails) - 1)]
    origin_subject = str(origin.get("subject", "Inbox incident"))
    origin_timestamp = str(origin.get("timestamp", "2026-04-03T00:00:00Z"))
    sequence_number = self._generated_followups + 1

    followup_email = {
        "email_id": f"followup-{self._scenario_id}-{sequence_number}",
        "subject": f"Escalation follow-up: {origin_subject}",
        "body": (
            "Automated escalation triggered because prior triage appears incomplete. "
            "Please route to the responsible team and provide a clear summary now."
        ),
        "sender": "incident-control@acme-enterprise.com",
        "timestamp": origin_timestamp,
        "thread_history": [f"Previous message subject: {origin_subject}"],
    }

    # The follow-up weighs slightly more than its source, kept within [1.5, 2.0].
    boosted_weight = float(truth.get("priority_weight", 1.5)) + 0.2
    followup_truth = {
        "label": "urgent",
        "route_to": wanted_route,
        "priority_weight": min(max(boosted_weight, 1.5), 2.0),
        "summary_keywords": ["escalation", "follow-up", wanted_route],
    }

    # Emails and ground truth are inserted at the same position to stay aligned.
    position = min(self._current_index + 1, len(self._emails))
    self._emails.insert(position, followup_email)
    self._ground_truth.insert(position, followup_truth)
    self._generated_followups = sequence_number
265
+
266
+ def _configure_runtime_controls(self) -> None:
267
+ """Apply deterministic runtime control options for production simulator."""
268
+ if self.task_id != "task_production":
269
+ self._max_generated_followups = 4
270
+ self._followup_quality_threshold = 0.7
271
+ return
272
+
273
+ escalation_mode = str(self.runtime_options.get("escalation_mode", "normal")).lower()
274
+ escalation_map = {
275
+ "low": (2, 0.55),
276
+ "normal": (4, 0.7),
277
+ "high": (8, 0.85),
278
+ }
279
+ max_followups, threshold = escalation_map.get(escalation_mode, escalation_map["normal"])
280
+ self._max_generated_followups = max_followups
281
+ self._followup_quality_threshold = threshold
282
+
283
def state(self) -> EnvironmentState:
    """Return a snapshot of the episode's progress and history.

    Returns:
        EnvironmentState carrying the task id, step counters, done flag and
        shallow copies of the action and reward histories.
    """
    snapshot = {
        "task_id": self.task_id,
        "current_step": self._current_step,
        "total_steps": self._max_steps,
        "done": self._done,
        # Copies keep callers from mutating the environment's own lists.
        "action_history": list(self._action_history),
        "reward_history": list(self._reward_history),
    }
    return EnvironmentState(**snapshot)
297
+
298
def _build_observation(self, email_index: int) -> EmailObservation:
    """Create the observation for the email at ``email_index``.

    Args:
        email_index: Zero-based email index; out-of-range values are clamped
            to the first or last email.

    Returns:
        EmailObservation for the selected email, or the terminal observation
        when the inbox is empty.
    """
    if not self._emails:
        return self._terminal_observation()

    last_index = len(self._emails) - 1
    payload = self._emails[min(max(email_index, 0), last_index)]

    def text_field(key: str) -> str:
        # Missing fields become empty strings; everything is coerced to str.
        return str(payload.get(key, ""))

    history_entries = list(map(str, payload.get("thread_history", [])))

    return EmailObservation(
        email_id=text_field("email_id"),
        subject=text_field("subject"),
        body=text_field("body"),
        sender=text_field("sender"),
        timestamp=text_field("timestamp"),
        thread_history=history_entries,
        task_id=self.task_id,
        step_number=self._current_step,
        total_emails=len(self._emails),
    )
324
+
325
def _terminal_observation(self) -> EmailObservation:
    """Create the placeholder observation used once the episode is over.

    Returns:
        EmailObservation with the fixed ``terminal`` id and system sender,
        stamped with the current step number and inbox size.
    """
    placeholder = {
        "email_id": "terminal",
        "subject": "Episode complete",
        "body": "No further emails remain for this task.",
        "sender": "system",
        "timestamp": "",
        "thread_history": [],
    }
    return EmailObservation(
        **placeholder,
        task_id=self.task_id,
        step_number=self._current_step,
        total_emails=len(self._emails),
    )
342
+
343
def _grade_current_step(self, action: TriageAction) -> RewardResult:
    """Grade the action against the ground truth of the current email.

    Args:
        action: Validated action for the current step.

    Returns:
        RewardResult from the grader matching the task id. Any task other than
        ``task_easy`` or ``task_medium`` is graded with ``grade_hard``. When no
        ground truth is loaded, a minimum-score result is returned.
    """
    if not self._ground_truth:
        return RewardResult(
            score=SCORE_EPSILON,
            breakdown={"missing_ground_truth": 1.0 - SCORE_EPSILON},
            feedback="Missing ground truth for task.",
        )

    # Clamp the index so steps past the end reuse the last ground-truth entry.
    truth = self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]

    graders_by_task = {
        "task_easy": grade_easy,
        "task_medium": grade_medium_step,
    }
    grader = graders_by_task.get(self.task_id, grade_hard)
    return grader(action, truth)
369
+
370
def _compute_penalties(self, action: TriageAction) -> float:
    """Sum the fixed penalties that apply to this action.

    Args:
        action: Validated action for the step.

    Returns:
        0.5 for an ``archive`` label with a summary shorter than 10 characters
        (after stripping), plus 0.3 when the repeated-action check fires.
    """
    summary_length = len(action.summary.strip())

    charges = []
    if action.label == "archive" and summary_length < 10:
        charges.append(0.5)
    if self._is_repeated_action_pattern(action):
        charges.append(0.3)

    return sum(charges, 0.0)
389
+
390
def _compute_progress_signal(
    self,
    base_score: float,
    previous_base_score: float | None,
) -> float:
    """Compute the small dense shaping reward for the current step.

    Args:
        base_score: Base grade of the current step.
        previous_base_score: Base grade of the previous step, or None on the
            first graded step.

    Returns:
        Sum of a completion term (up to 0.05), a quality term (0.05 times the
        clipped base score) and a trend term limited to [-0.02, 0.03].
    """
    inbox_size = max(len(self._emails), 1)
    fraction_done = min(1.0, (self._current_index + 1) / inbox_size)

    completion_part = 0.05 * fraction_done
    quality_part = 0.05 * self._clip_reward(base_score)

    if previous_base_score is None:
        trend_part = 0.0
    else:
        scaled_delta = (base_score - previous_base_score) * 0.1
        trend_part = max(-0.02, min(0.03, scaled_delta))

    return completion_part + quality_part + trend_part
416
+
417
+ def _compute_step_cost(self) -> float:
418
+ """Return a gentle efficiency cost that grows with episode length."""
419
+ normalized_step = self._current_step / max(self._max_steps, 1)
420
+ return 0.005 + (0.01 * normalized_step)
421
+
422
+ def _compute_trajectory_bonus(self) -> float:
423
+ """Return trajectory bonus when episode completion quality is high.
424
+
425
+ Returns:
426
+ 0.2 when mean base score is above threshold at completion, else 0.0.
427
+ """
428
+ if not self._base_score_history:
429
+ return 0.0
430
+
431
+ all_emails_done_after_step = self._current_index + 1 >= len(self._emails)
432
+ if not all_emails_done_after_step:
433
+ return 0.0
434
+
435
+ mean_base = sum(self._base_score_history) / len(self._base_score_history)
436
+ return 0.2 if mean_base > 0.8 else 0.0
437
+
438
def _is_repeated_action_pattern(self, action: TriageAction) -> bool:
    """Check whether ``action`` repeats the last two recorded actions.

    Args:
        action: Action to compare with the two most recent history entries.

    Returns:
        True when the label and the stripped, lower-cased route of ``action``
        equal those of both of the last two entries in the action history.
        False when fewer than two actions have been recorded.
    """
    if len(self._action_history) < 2:
        return False

    latest = self._action_history[-1]
    before_latest = self._action_history[-2]

    if not (latest.label == before_latest.label == action.label):
        return False

    latest_route = latest.route_to.strip().lower()
    if latest_route != before_latest.route_to.strip().lower():
        return False
    return latest_route == action.route_to.strip().lower()
459
+
460
def _clip_reward(self, reward_value: float) -> float:
    """Clamp a reward into ``[SCORE_EPSILON, 1.0 - SCORE_EPSILON]``.

    Args:
        reward_value: Raw reward value.

    Returns:
        The reward limited to the closed interval above, which keeps it
        strictly inside (0.0, 1.0).
    """
    capped = min(1.0 - SCORE_EPSILON, reward_value)
    return max(SCORE_EPSILON, capped)
graders.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic graders for OpenEnv email triage tasks."""
2
+
3
+ import re
4
+
5
+ from models import RewardResult, TriageAction
6
+
7
+ ROUTE_ALIAS_MAP = {
8
+ "billing": ["billing", "finance", "payments", "accounts"],
9
+ "safety": ["safety", "compliance", "risk"],
10
+ "engineering": ["engineering", "eng", "sre", "platform", "on-call"],
11
+ "support": ["support", "helpdesk", "customer support"],
12
+ "general": ["general", "inbox", "operations"],
13
+ }
14
+
15
+ SCORE_EPSILON = 1e-6
16
+
17
+
18
def _strict_binary_score(is_positive_case: bool) -> float:
    """Map a boolean outcome to a score just inside the open interval (0, 1)."""
    if is_positive_case:
        return 1.0 - SCORE_EPSILON
    return SCORE_EPSILON
21
+
22
+
23
def _strict_ratio_score(raw_value: float) -> float:
    """Clip a ratio-like metric via ``_clip_score`` so it stays inside (0, 1)."""
    bounded_value = _clip_score(raw_value)
    return bounded_value
26
+
27
+
28
def _clip_score(score_value: float) -> float:
    """Clip a score so the result is strictly between 0.0 and 1.0.

    Args:
        score_value: Raw score.

    Returns:
        ``SCORE_EPSILON`` for values at or below 0.0, ``1.0 - SCORE_EPSILON`` for
        values at or above 1.0, otherwise the value unchanged.
    """
    bounded = max(0.0, min(1.0, score_value))
    if 0.0 < bounded < 1.0:
        return bounded
    # Exactly on a boundary: nudge inward by epsilon.
    return SCORE_EPSILON if bounded <= 0.0 else 1.0 - SCORE_EPSILON
43
+
44
+
45
+ def _normalized_text(text_value: str) -> str:
46
+ """Return normalized lowercase text for deterministic comparisons.
47
+
48
+ Args:
49
+ text_value: Input text.
50
+
51
+ Returns:
52
+ Normalized text.
53
+ """
54
+ return text_value.strip().lower()
55
+
56
+
57
def _route_matches(action_route: str, expected_route: str) -> bool:
    """Tell whether the agent's route resolves to the expected canonical route.

    Args:
        action_route: Route text provided by the agent.
        expected_route: Route expected by the ground truth.

    Returns:
        True when the normalized expected route is among the canonical routes
        derived from ``action_route``; False for an empty expected route.
    """
    wanted = _normalized_text(expected_route)
    return bool(wanted) and wanted in _canonical_route_tokens(action_route)
72
+
73
+
74
def _canonical_route_tokens(action_route: str) -> set[str]:
    """Translate free-form route text into the set of canonical route names.

    The text is split on ``, ; / |``. Each fragment contributes the first route
    in ``ROUTE_ALIAS_MAP`` order that has an alias occurring as a substring of
    the fragment. If no fragment matches, the whole normalized text is checked
    against every route.
    """
    normalized_action = _normalized_text(action_route)
    if not normalized_action:
        return set()

    def mentions(text: str, aliases: list[str]) -> bool:
        return any(alias in text for alias in aliases)

    pieces = (piece.strip() for piece in re.split(r"[,;/|]+", normalized_action))

    canonical: set[str] = set()
    for piece in pieces:
        if not piece:
            continue
        first_hit = next(
            (name for name, aliases in ROUTE_ALIAS_MAP.items() if mentions(piece, aliases)),
            None,
        )
        if first_hit is not None:
            canonical.add(first_hit)

    # Fallback for phrases without separators.
    if not canonical:
        canonical = {
            name
            for name, aliases in ROUTE_ALIAS_MAP.items()
            if mentions(normalized_action, aliases)
        }

    return canonical
100
+
101
+
102
def _route_noise_penalty(action_route: str) -> float:
    """Return 0.08 per canonical route beyond two, capped at 0.24."""
    surplus_routes = len(_canonical_route_tokens(action_route)) - 2
    if surplus_routes <= 0:
        return 0.0
    return min(0.24, 0.08 * surplus_routes)
108
+
109
+
110
def _summary_keyword_score(summary_text: str, ground_truth: dict) -> float:
    """Score a summary by the share of expected keywords it contains.

    Args:
        summary_text: Summary text produced by the agent.
        ground_truth: Ground-truth dict that may include ``summary_keywords``.

    Returns:
        Clipped product of the keyword hit ratio, a brevity factor and a
        list-like penalty. When no usable keyword list is present, the score is
        a binary check that the stripped summary has at least 10 characters.
    """

    def length_only_score() -> float:
        return _strict_binary_score(len(summary_text.strip()) >= 10)

    raw_keywords = ground_truth.get("summary_keywords", [])
    if not isinstance(raw_keywords, list):
        return length_only_score()

    candidates = (_normalized_text(str(item)) for item in raw_keywords)
    keywords = [candidate for candidate in candidates if candidate]
    if not keywords:
        return length_only_score()

    normalized_summary = _normalized_text(summary_text)
    matches = sum(1 for keyword in keywords if keyword in normalized_summary)
    base_score = matches / len(keywords)

    # Discourage keyword stuffing and overly verbose summaries.
    word_count = len(re.findall(r"[a-z0-9'-]+", normalized_summary))
    if word_count > 40:
        brevity_factor = max(0.45, 1.0 - (word_count - 40) * 0.02)
    elif word_count >= 4:
        brevity_factor = 1.0
    else:
        brevity_factor = 0.6

    looks_like_keyword_list = normalized_summary.count(",") >= 6 and matches >= 3
    list_like_penalty = 0.85 if looks_like_keyword_list else 1.0

    return _clip_score(base_score * brevity_factor * list_like_penalty)
151
+
152
+
153
def grade_easy(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Grade one easy-task email with partial credit.

    Args:
        action: Agent action for one email.
        ground_truth: Expected label, route and optional summary keywords.

    Returns:
        RewardResult whose score is 0.6 for the label, 0.25 for the route and
        up to 0.15 for the summary, minus the route-noise penalty, clipped
        into (0, 1).
    """
    wanted_label = _normalized_text(str(ground_truth.get("label", "")))
    wanted_route = _normalized_text(str(ground_truth.get("route_to", "")))

    label_ok = _normalized_text(action.label) == wanted_label
    route_ok = _route_matches(action.route_to, wanted_route)
    summary_score = _summary_keyword_score(action.summary, ground_truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    label_points = 0.6 if label_ok else 0.0
    route_points = 0.25 if route_ok else 0.0
    total = label_points + route_points
    total += 0.15 * summary_score
    total -= noise_penalty

    return RewardResult(
        score=_clip_score(total),
        breakdown={
            "label_match": _strict_binary_score(label_ok),
            "route_match": _strict_binary_score(route_ok),
            "summary_match": _strict_ratio_score(summary_score),
            "route_noise_penalty": _strict_ratio_score(noise_penalty),
        },
        feedback="Easy-task grading completed with context summary scoring.",
    )
184
+
185
+
186
def grade_medium_step(action: TriageAction, truth: dict) -> RewardResult:
    """Grade a single medium-task email, scaled by its priority weight.

    The per-email score is 0.55 for the label, 0.3 for the route and up to 0.15
    for the summary, minus the route-noise penalty. It is clipped, multiplied
    by the priority weight (floored at 0.1, capped at 2.0) and clipped again.
    """
    wanted_label = _normalized_text(str(truth.get("label", "")))
    wanted_route = _normalized_text(str(truth.get("route_to", "")))
    priority_weight = max(float(truth.get("priority_weight", 1.0)), 0.1)

    label_ok = _normalized_text(action.label) == wanted_label
    route_ok = _route_matches(action.route_to, wanted_route)
    summary_score = _summary_keyword_score(action.summary, truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    label_points = 0.55 if label_ok else 0.0
    route_points = 0.3 if route_ok else 0.0
    email_score = label_points + route_points
    email_score += 0.15 * summary_score
    email_score -= noise_penalty
    email_score = _clip_score(email_score)

    effective_weight = min(priority_weight, 2.0)
    breakdown = {
        "label_match": _strict_binary_score(label_ok),
        "route_match": _strict_binary_score(route_ok),
        "summary_match": _strict_ratio_score(summary_score),
        "priority_weight": _strict_ratio_score(min(priority_weight / 2.0, 1.0)),
        "route_noise_penalty": _strict_ratio_score(noise_penalty),
    }
    return RewardResult(
        score=_clip_score(email_score * effective_weight),
        breakdown=breakdown,
        feedback="Medium-task step grading completed.",
    )
215
+
216
+
217
def grade_medium(actions: list[TriageAction], ground_truths: list[dict]) -> RewardResult:
    """Grade a medium-task queue as a priority-weighted average of email scores.

    Only the first ``min(len(actions), len(ground_truths))`` pairs are graded.

    Args:
        actions: Agent actions for the medium task email queue.
        ground_truths: Expected action details for each email.

    Returns:
        RewardResult whose score is ``sum(w_i * s_i) / sum(w_i)`` clipped into
        (0, 1), where ``s_i`` is the unweighted per-email score and ``w_i`` the
        priority weight limited to [0.1, 2.0]. A minimum-score result is
        returned when there is nothing to compare.
    """
    comparable_count = min(len(actions), len(ground_truths))
    if comparable_count == 0:
        return RewardResult(
            score=SCORE_EPSILON,
            breakdown={"emails_scored": SCORE_EPSILON, "weighted_average": SCORE_EPSILON},
            feedback="No actions available for grading.",
        )

    weighted_score_sum = 0.0
    weight_sum = 0.0
    label_hits = 0
    route_hits = 0
    summary_total = 0.0
    noise_penalty_total = 0.0

    # zip stops at the shorter sequence, i.e. after comparable_count pairs.
    for action, truth in zip(actions, ground_truths):
        priority_weight = min(max(float(truth.get("priority_weight", 1.0)), 0.1), 2.0)

        # Grade with a neutral weight and apply the real weight here. The step
        # grader clips its weighted score below 1.0, so summing those scores and
        # dividing by the weight sum would cap a perfect answer on a weight-2.0
        # email at about 0.5.
        step_result = grade_medium_step(action, {**truth, "priority_weight": 1.0})
        weighted_score_sum += step_result.score * priority_weight
        weight_sum += priority_weight

        expected_label = _normalized_text(str(truth.get("label", "")))
        expected_route = _normalized_text(str(truth.get("route_to", "")))
        label_hits += 1 if _normalized_text(action.label) == expected_label else 0
        route_hits += 1 if _route_matches(action.route_to, expected_route) else 0
        summary_total += float(step_result.breakdown.get("summary_match", SCORE_EPSILON))
        noise_penalty_total += float(
            step_result.breakdown.get("route_noise_penalty", SCORE_EPSILON)
        )

    weighted_average = weighted_score_sum / weight_sum if weight_sum > 0.0 else 0.0
    score_value = _clip_score(weighted_average)

    breakdown = {
        # n / (n + 1) keeps the count indicator strictly below 1.0.
        "emails_scored": _strict_ratio_score(float(comparable_count) / (comparable_count + 1.0)),
        "label_accuracy": _strict_ratio_score(label_hits / comparable_count),
        "route_accuracy": _strict_ratio_score(route_hits / comparable_count),
        "summary_accuracy": _strict_ratio_score(summary_total / comparable_count),
        "avg_route_noise_penalty": _strict_ratio_score(noise_penalty_total / comparable_count),
        "weighted_average": score_value,
    }
    feedback = "Weighted medium-task grading completed."
    return RewardResult(score=score_value, breakdown=breakdown, feedback=feedback)
273
+
274
+
275
def grade_hard(action: TriageAction, ground_truth: dict) -> RewardResult:
    """Grade a hard-task email using weighted escalation, routing and urgency parts.

    Args:
        action: Agent action for the hard task case.
        ground_truth: Expected label, primary route (``route_to``), secondary
            route (``cc_route``) and optional ``penalize_spam`` amount.

    Returns:
        RewardResult whose score is 0.35 for the primary route, 0.25 for the
        secondary route, 0.25 for the label and up to 0.15 for the summary,
        minus the route-noise penalty and, for a ``spam`` label, the spam
        penalty; clipped into (0, 1).
    """
    wanted_label = _normalized_text(str(ground_truth.get("label", "urgent")))
    primary_route = _normalized_text(str(ground_truth.get("route_to", "safety")))
    secondary_route = _normalized_text(str(ground_truth.get("cc_route", "billing")))
    spam_penalty = float(ground_truth.get("penalize_spam", 0.2))

    route_text = _normalized_text(action.route_to)
    label_text = _normalized_text(action.label)
    labeled_as_spam = label_text == "spam"

    summary_score = _summary_keyword_score(action.summary, ground_truth)
    noise_penalty = _route_noise_penalty(action.route_to)

    escalation_component = 0.35 if _route_matches(route_text, primary_route) else 0.0
    routing_component = 0.25 if _route_matches(route_text, secondary_route) else 0.0
    urgency_component = 0.25 if label_text == wanted_label else 0.0
    summary_component = 0.15 * summary_score

    raw_score = escalation_component + routing_component + urgency_component + summary_component
    raw_score -= noise_penalty
    if labeled_as_spam:
        raw_score -= spam_penalty

    reported_spam_penalty = spam_penalty if labeled_as_spam else SCORE_EPSILON
    return RewardResult(
        score=_clip_score(raw_score),
        breakdown={
            "escalation_component": _strict_ratio_score(escalation_component),
            "routing_component": _strict_ratio_score(routing_component),
            "urgency_component": _strict_ratio_score(urgency_component),
            "summary_component": _strict_ratio_score(summary_component),
            "route_noise_penalty": _strict_ratio_score(noise_penalty),
            "spam_penalty": _strict_ratio_score(reported_spam_penalty),
        },
        feedback="Hard-task weighted policy grading completed.",
    )
inference.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Inference script for OpenEnv email triage with strict stdout event format."""
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import re
7
+ import time
8
+ from typing import Any
9
+
10
+ from openai import OpenAI
11
+
12
+ from environment import EmailTriageEnv
13
+ from models import EmailObservation, TriageAction
14
+
15
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
16
+ MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
17
+ HF_TOKEN = os.getenv("HF_TOKEN")
18
+ API_KEY = HF_TOKEN or os.getenv("API_KEY")
19
+ LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
20
+
21
+ BENCHMARK = "openenv-email-triage"
22
+ MAX_STEPS = 30
23
+ TEMPERATURE = 0.2
24
+ MAX_TOKENS = 200
25
+ SUCCESS_SCORE_THRESHOLD = 0.5
26
+ LOG_SCORE_EPSILON = 1e-6
27
+ DEFAULT_RUNTIME_BUDGET_SECONDS = int(os.getenv("INFERENCE_RUNTIME_BUDGET_SECONDS", "1140"))
28
+ DEFAULT_REQUEST_TIMEOUT_SECONDS = float(os.getenv("INFERENCE_REQUEST_TIMEOUT_SECONDS", "12"))
29
+
30
+ SYSTEM_PROMPT = (
31
+ "You are an email triage assistant. For each email, prioritize risk/time impact, "
32
+ "categorize with one label (urgent|normal|spam|archive), route to the best team, "
33
+ "and summarize the key evidence. Return one JSON object with keys label, summary, route_to."
34
+ )
35
+
36
+ FALLBACK_ACTION = {
37
+ "label": "normal",
38
+ "summary": "Unable to parse response",
39
+ "route_to": "general",
40
+ }
41
+
42
+ TASK_MAP = {
43
+ "1": "task_easy",
44
+ "2": "task_medium",
45
+ "3": "task_hard",
46
+ "4": "task_production",
47
+ }
48
+
49
+
50
def parse_args() -> argparse.Namespace:
    """Define the inference CLI and parse the process arguments.

    Returns:
        Namespace with the task selection, model override, split, episode
        count, time budgets and the task 4 profile/escalation switches.
    """
    # One (flag, add_argument keyword arguments) pair per option, in CLI order.
    option_table = (
        (
            "--task",
            {
                "default": "all",
                "choices": ["1", "2", "3", "4", "all"],
                "help": "Task selection: 1, 2, 3, 4, or all.",
            },
        ),
        (
            "--model",
            {
                "default": None,
                "help": "Optional model override. Falls back to MODEL_NAME environment variable.",
            },
        ),
        (
            "--split",
            {
                "default": os.getenv("OPENENV_EVAL_SPLIT", "public"),
                "choices": ["public", "private_eval"],
                "help": "Scenario split to evaluate.",
            },
        ),
        (
            "--episodes-per-task",
            {
                "default": 1,
                "type": int,
                "help": "Number of deterministic scenarios to evaluate per task.",
            },
        ),
        (
            "--runtime-budget-seconds",
            {
                "default": DEFAULT_RUNTIME_BUDGET_SECONDS,
                "type": int,
                "help": "Global wall-clock budget for the full script run.",
            },
        ),
        (
            "--request-timeout-seconds",
            {
                "default": DEFAULT_REQUEST_TIMEOUT_SECONDS,
                "type": float,
                "help": "Timeout per LLM request.",
            },
        ),
        (
            "--production-profile",
            {
                "default": "standard",
                "choices": ["light", "standard", "heavy"],
                "help": "Runtime workload profile used for task 4 episodes.",
            },
        ),
        (
            "--business-hours-mode",
            {
                "action": "store_true",
                "help": "If set, task 4 timestamps focus on business-hours windows.",
            },
        ),
        (
            "--escalation-mode",
            {
                "default": "normal",
                "choices": ["low", "normal", "high"],
                "help": "Escalation strictness for task 4 follow-up generation.",
            },
        ),
    )

    parser = argparse.ArgumentParser(description="Run OpenEnv email triage inference.")
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
106
+
107
+
108
def validate_runtime_config(model_name: str | None) -> str:
    """Ensure an API key is configured and pick the model name to use.

    Args:
        model_name: Optional override; a falsy value selects ``MODEL_NAME``.

    Returns:
        The effective model name.

    Raises:
        ValueError: If neither HF_TOKEN nor API_KEY provided a key.
    """
    if not API_KEY:
        raise ValueError("Missing HF_TOKEN or API_KEY environment variable.")
    return model_name or MODEL_NAME
115
+
116
+
117
def log_start(task_name: str, benchmark_name: str, model_name: str) -> None:
    """Print the mandatory ``[START]`` line and flush stdout."""
    start_line = f"[START] task={task_name} env={benchmark_name} model={model_name}"
    print(start_line, flush=True)
123
+
124
+
125
def _format_open_score(value: float) -> str:
    """Render a score with six decimals after clamping it inside (0, 1).

    Clamping to ``[LOG_SCORE_EPSILON, 1 - LOG_SCORE_EPSILON]`` keeps the text
    from collapsing to an all-zero or exact-one value.
    """
    capped = min(1.0 - LOG_SCORE_EPSILON, float(value))
    clamped = max(LOG_SCORE_EPSILON, capped)
    return f"{clamped:.6f}"
129
+
130
+
131
def log_step(step: int, action_str: str, reward: float, done: bool, error: str | None) -> None:
    """Print the mandatory ``[STEP]`` line; a falsy error is logged as ``null``."""
    fields = (
        f"step={step}",
        f"action={action_str}",
        f"reward={_format_open_score(reward)}",
        f"done={str(done).lower()}",
        f"error={error if error else 'null'}",
    )
    print("[STEP] " + " ".join(fields), flush=True)
140
+
141
+
142
def log_end(success: bool, steps: int, rewards: list[float]) -> None:
    """Print the mandatory ``[END]`` line with comma-separated formatted rewards."""
    formatted_rewards = [_format_open_score(reward) for reward in rewards]
    rewards_str = ",".join(formatted_rewards)
    success_text = str(success).lower()
    print(f"[END] success={success_text} steps={steps} rewards={rewards_str}", flush=True)
149
+
150
+
151
def build_user_prompt(observation: EmailObservation, history: list[str]) -> str:
    """Render the user prompt from the observation and the last five history lines.

    Args:
        observation: Current email observation.
        history: Earlier step descriptions; only the last five are included,
            and an empty list is rendered as ``None``.

    Returns:
        Multi-line prompt that ends with the JSON output instruction.
    """
    recent_history = "\n".join(history[-5:]) if history else "None"
    prompt_lines = [
        f"email_id: {observation.email_id}",
        f"subject: {observation.subject}",
        f"sender: {observation.sender}",
        f"timestamp: {observation.timestamp}",
        f"body: {observation.body}",
        f"thread_history: {observation.thread_history}",
        f"task_id: {observation.task_id}",
        f"step_number: {observation.step_number}",
        f"total_emails: {observation.total_emails}",
        "",
        "recent_history:",
        recent_history,
        "",
        "Return exactly one JSON object with label, summary, route_to.",
    ]
    return "\n".join(prompt_lines)
167
+
168
+
169
def strip_action_prefixes(response_text: str) -> str:
    """Strip code fences and a leading ``action:`` style prefix from model output.

    Removes, in order: an opening triple-backtick fence (optionally tagged
    ``json``), a closing fence at the very end, and a leading ``next action:``
    or ``action:`` prefix. Whitespace is trimmed after every removal.
    """
    removal_passes = (
        (r"^```(?:json)?", re.IGNORECASE),
        (r"```$", 0),
        (r"^(next\s+action|action)\s*:\s*", re.IGNORECASE),
    )
    cleaned = response_text.strip()
    for pattern, flags in removal_passes:
        cleaned = re.sub(pattern, "", cleaned, flags=flags).strip()
    return cleaned
176
+
177
+
178
def parse_text_action(cleaned_text: str) -> dict[str, str]:
    """Extract label, route and summary from free-form text with regexes.

    Returns:
        Dict holding only the fields that were found: ``label`` (lower-cased),
        ``route_to`` (stripped and lower-cased) and ``summary`` (stripped).
    """
    # (result key, pattern, post-processing applied to the captured group)
    extractors = (
        (
            "label",
            r"(?:\"label\"|label)\s*[:=]\s*\"?(urgent|normal|spam|archive)\"?",
            lambda captured: captured.lower(),
        ),
        (
            "route_to",
            r"(?:\"route_to\"|route_to|route)\s*[:=]\s*\"?([a-zA-Z0-9_\-/ ]+)\"?",
            lambda captured: captured.strip().lower(),
        ),
        (
            "summary",
            r"(?:\"summary\"|summary)\s*[:=]\s*\"?([^\"\n]+)\"?",
            lambda captured: captured.strip(),
        ),
    )

    result: dict[str, str] = {}
    for field_name, pattern, tidy in extractors:
        found = re.search(pattern, cleaned_text, flags=re.IGNORECASE)
        if found:
            result[field_name] = tidy(found.group(1))
    return result
207
+
208
+
209
def parse_action_response(response_text: str) -> TriageAction:
    """Turn raw model output into a ``TriageAction``, never raising on bad text.

    Strategy: strip wrappers, try to decode the outermost ``{...}`` span as a
    JSON object, fall back to regex extraction when that yields nothing, then
    overlay whatever was found on a copy of ``FALLBACK_ACTION``.

    Args:
        response_text: Raw text returned by the model (may be empty).

    Returns:
        The validated action, or the validated ``FALLBACK_ACTION`` when the
        merged payload does not validate.
    """
    text = strip_action_prefixes(response_text)
    payload: dict[str, Any] = {}

    first_brace = text.find("{")
    last_brace = text.rfind("}")
    # Both braces present and in order (a -1 from rfind can never exceed find).
    if 0 <= first_brace < last_brace:
        try:
            decoded = json.loads(text[first_brace : last_brace + 1])
        except json.JSONDecodeError:
            decoded = None
        if isinstance(decoded, dict):
            payload = decoded

    # Empty or non-object JSON: try the regex-based extraction instead.
    if not payload:
        payload = parse_text_action(text)

    merged = dict(FALLBACK_ACTION)
    merged.update(payload)

    try:
        return TriageAction.model_validate(merged)
    except Exception:
        # Any validation problem degrades to the known-good fallback action.
        return TriageAction.model_validate(FALLBACK_ACTION)
235
+
236
+
237
def action_to_log_string(action: TriageAction) -> str:
    """Serialize an action as compact, ASCII-only JSON for the STEP log line.

    Args:
        action: The action to serialize via ``model_dump()``.

    Returns:
        JSON text with no whitespace after separators.
    """
    dumped = action.model_dump()
    return json.dumps(dumped, ensure_ascii=True, separators=(",", ":"))
240
+
241
+
242
def run_episode(
    client: OpenAI,
    model_name: str,
    task_id: str,
    scenario_index: int,
    eval_split: str,
    deadline: float,
    request_timeout_seconds: float,
    runtime_options: dict[str, Any] | None = None,
) -> None:
    """Run one episode and emit strict START/STEP/END lines.

    Args:
        client: Chat-completions client used to query the model.
        model_name: Model identifier sent with each completion request.
        task_id: Task passed to ``EmailTriageEnv``.
        scenario_index: Scenario passed to ``EmailTriageEnv``.
        eval_split: Split passed to ``EmailTriageEnv`` as ``split``.
        deadline: ``time.monotonic()`` value after which no new step starts.
        request_timeout_seconds: Upper bound for a single completion request.
        runtime_options: Optional options forwarded to ``EmailTriageEnv``.

    Exceptions raised while running the episode are caught here and recorded
    as ``success = False``; the END line is emitted from the ``finally`` block.
    """
    rewards: list[float] = []
    steps_taken = 0
    success = False
    # Stays None if construction fails, so cleanup below can skip it.
    env: EmailTriageEnv | None = None

    log_start(task_name=task_id, benchmark_name=BENCHMARK, model_name=model_name)

    try:
        env = EmailTriageEnv(
            task_id=task_id,
            scenario_index=scenario_index,
            split=eval_split,
            runtime_options=runtime_options,
        )
        reset_result = env.reset()
        observation = reset_result.observation
        history: list[str] = []

        for step in range(1, MAX_STEPS + 1):
            # Stop before starting a step once the shared time budget is spent.
            if time.monotonic() >= deadline:
                break

            prompt = build_user_prompt(observation, history)

            response_text = ""
            try:
                # Clamp the per-request timeout to the remaining budget, but
                # never below one second.
                remaining = max(1.0, deadline - time.monotonic())
                timeout_seconds = max(
                    1.0,
                    min(float(request_timeout_seconds), float(remaining)),
                )
                completion = client.chat.completions.create(
                    model=model_name,
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": prompt},
                    ],
                    temperature=TEMPERATURE,
                    max_tokens=MAX_TOKENS,
                    stream=False,
                    timeout=timeout_seconds,
                )
                response_text = completion.choices[0].message.content or ""
            except Exception:
                # A failed request is treated like an empty reply; the parser
                # below then decides what action to take for empty text.
                response_text = ""

            action = parse_action_response(response_text)
            step_result = env.step(action)

            reward = float(step_result.reward)
            done = bool(step_result.done)
            # Only a string "validation_error" is reported; anything else logs as None.
            error_raw = step_result.info.get("validation_error")
            error = str(error_raw) if isinstance(error_raw, str) else None

            rewards.append(reward)
            steps_taken = step

            log_step(
                step=step,
                action_str=action_to_log_string(action),
                reward=reward,
                done=done,
                error=error,
            )

            # Compact per-step record fed back into later prompts.
            history.append(
                f"step={step} action={action.label}/{action.route_to} reward={_format_open_score(reward)}"
            )
            observation = step_result.observation

            if done:
                break

        # Mean reward over the steps taken; max(..., 1) avoids dividing by
        # zero when no step ran.
        avg_reward = sum(rewards) / max(len(rewards), 1)
        success = avg_reward >= SUCCESS_SCORE_THRESHOLD
    except Exception:
        success = False
    finally:
        if env is not None:
            # close() is optional on the environment; call it only if present.
            close_method = getattr(env, "close", None)
            if callable(close_method):
                try:
                    close_method()
                except Exception:
                    pass

        log_end(success=success, steps=steps_taken, rewards=rewards)
340
+
341
+
342
def main() -> None:
    """Parse CLI arguments and run the requested episodes.

    One wall-clock deadline is shared by every episode. An invalid runtime
    configuration prints the error message and exits with status 1. When
    ``args.task`` is not a key of ``TASK_MAP``, every task in the map is run.
    """
    args = parse_args()
    deadline = time.monotonic() + max(args.runtime_budget_seconds, 1)
    request_timeout_seconds = max(float(args.request_timeout_seconds), 1.0)

    try:
        effective_model = validate_runtime_config(args.model)
    except ValueError as error:
        print(str(error), flush=True)
        raise SystemExit(1) from error

    _ = LOCAL_IMAGE_NAME

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    if args.task in TASK_MAP:
        selected_tasks = [TASK_MAP[args.task]]
    else:
        selected_tasks = list(TASK_MAP.values())

    for task_id in selected_tasks:
        # Only the production task takes runtime options.
        if task_id == "task_production":
            runtime_options = {
                "production_profile": args.production_profile,
                "business_hours_mode": args.business_hours_mode,
                "escalation_mode": args.escalation_mode,
            }
        else:
            runtime_options = None

        for scenario_index in range(max(args.episodes_per_task, 1)):
            run_episode(
                client=client,
                model_name=effective_model,
                task_id=task_id,
                scenario_index=scenario_index,
                eval_split=args.split,
                deadline=deadline,
                request_timeout_seconds=request_timeout_seconds,
                runtime_options=runtime_options,
            )
381
+
382
+
383
+ if __name__ == "__main__":
384
+ main()
server.py ADDED
@@ -0,0 +1,775 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flask server wrapper for the OpenEnv email triage environment."""
2
+
3
+ import os
4
+
5
+ from flask import Flask, Response, jsonify, request
6
+
7
+ from environment import EmailTriageEnv
8
+ from tasks import get_task_scenario_count, list_task_ids
9
+
10
+ FRONTEND_HTML = """<!doctype html>
11
+ <html lang="en">
12
+ <head>
13
+ <meta charset="utf-8" />
14
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
15
+ <title>Inbox Helper Practice</title>
16
+ <style>
17
+ @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap');
18
+
19
+ :root {
20
+ --bg: #f5f1e9;
21
+ --paper: #fffaf2;
22
+ --ink: #102433;
23
+ --accent: #ea6a2a;
24
+ --accent-soft: #ffd6bf;
25
+ --line: #d7cabb;
26
+ --ok: #0f7b6c;
27
+ --warn: #9a3a12;
28
+ --radius: 14px;
29
+ }
30
+
31
+ * { box-sizing: border-box; }
32
+
33
+ body {
34
+ margin: 0;
35
+ font-family: 'Space Grotesk', sans-serif;
36
+ color: var(--ink);
37
+ background:
38
+ radial-gradient(1100px 460px at -10% -20%, #f2bc9f 0%, transparent 60%),
39
+ radial-gradient(1100px 520px at 120% 115%, #b8d7cf 0%, transparent 62%),
40
+ var(--bg);
41
+ min-height: 100vh;
42
+ }
43
+
44
+ .wrap {
45
+ max-width: 1100px;
46
+ margin: 28px auto;
47
+ padding: 0 16px;
48
+ animation: reveal .45s ease-out;
49
+ }
50
+
51
+ @keyframes reveal {
52
+ from { opacity: 0; transform: translateY(10px); }
53
+ to { opacity: 1; transform: translateY(0); }
54
+ }
55
+
56
+ .title {
57
+ display: flex;
58
+ justify-content: space-between;
59
+ align-items: baseline;
60
+ gap: 14px;
61
+ margin-bottom: 14px;
62
+ }
63
+
64
+ h1 {
65
+ margin: 0;
66
+ font-size: clamp(1.5rem, 2vw, 2.2rem);
67
+ letter-spacing: .4px;
68
+ }
69
+
70
+ .subtitle {
71
+ margin: 6px 0 0;
72
+ font-size: .95rem;
73
+ opacity: .8;
74
+ }
75
+
76
+ .badge {
77
+ background: var(--accent-soft);
78
+ border: 1px solid #f2b693;
79
+ color: #7f2e0b;
80
+ padding: 6px 10px;
81
+ border-radius: 999px;
82
+ font-size: .85rem;
83
+ font-weight: 600;
84
+ }
85
+
86
+ .grid {
87
+ display: grid;
88
+ grid-template-columns: 1fr;
89
+ gap: 14px;
90
+ }
91
+
92
+ @media (min-width: 900px) {
93
+ .grid { grid-template-columns: 1fr 1fr; }
94
+ .wide { grid-column: span 2; }
95
+ }
96
+
97
+ .card {
98
+ background: var(--paper);
99
+ border: 1px solid var(--line);
100
+ border-radius: var(--radius);
101
+ padding: 14px;
102
+ box-shadow: 0 8px 28px rgba(16, 36, 51, 0.08);
103
+ }
104
+
105
+ .card h2 {
106
+ margin: 0 0 10px;
107
+ font-size: 1rem;
108
+ text-transform: uppercase;
109
+ letter-spacing: .08em;
110
+ opacity: .86;
111
+ }
112
+
113
+ .row {
114
+ display: flex;
115
+ flex-wrap: wrap;
116
+ gap: 8px;
117
+ align-items: center;
118
+ margin-bottom: 10px;
119
+ }
120
+
121
+ select, input, textarea, button {
122
+ font-family: inherit;
123
+ font-size: .95rem;
124
+ }
125
+
126
+ select, input, textarea {
127
+ width: 100%;
128
+ border: 1px solid #cdbba6;
129
+ border-radius: 10px;
130
+ padding: 9px 10px;
131
+ background: #fff;
132
+ color: var(--ink);
133
+ }
134
+
135
+ textarea {
136
+ min-height: 92px;
137
+ resize: vertical;
138
+ }
139
+
140
+ button {
141
+ border: 0;
142
+ border-radius: 10px;
143
+ padding: 9px 12px;
144
+ font-weight: 700;
145
+ background: var(--ink);
146
+ color: #fff;
147
+ cursor: pointer;
148
+ transition: transform .12s ease, opacity .12s ease;
149
+ }
150
+
151
+ button.secondary {
152
+ background: #285066;
153
+ }
154
+
155
+ button.accent {
156
+ background: var(--accent);
157
+ }
158
+
159
+ button:hover { transform: translateY(-1px); }
160
+ button:active { transform: translateY(0); opacity: .92; }
161
+
162
+ .status {
163
+ padding: 8px 10px;
164
+ border-radius: 10px;
165
+ background: #eef7f5;
166
+ border: 1px solid #c7e4de;
167
+ color: var(--ok);
168
+ font-weight: 600;
169
+ min-height: 40px;
170
+ display: flex;
171
+ align-items: center;
172
+ }
173
+
174
+ .status.error {
175
+ background: #fff1ea;
176
+ border-color: #ffc8ae;
177
+ color: var(--warn);
178
+ }
179
+
180
+ pre {
181
+ margin: 0;
182
+ white-space: pre-wrap;
183
+ background: #0f1b24;
184
+ color: #d9efe9;
185
+ border-radius: 10px;
186
+ padding: 12px;
187
+ max-height: 340px;
188
+ overflow: auto;
189
+ font-family: 'IBM Plex Mono', monospace;
190
+ font-size: .85rem;
191
+ border: 1px solid #21313f;
192
+ }
193
+
194
+ .email-block {
195
+ background: #fff;
196
+ border: 1px solid #d9ccbc;
197
+ border-radius: 10px;
198
+ padding: 12px;
199
+ }
200
+
201
+ .email-row {
202
+ margin-bottom: 8px;
203
+ font-size: .95rem;
204
+ line-height: 1.35;
205
+ }
206
+
207
+ .email-row strong {
208
+ display: inline-block;
209
+ min-width: 66px;
210
+ }
211
+
212
+ .help {
213
+ margin: 0 0 10px;
214
+ font-size: .9rem;
215
+ opacity: .8;
216
+ }
217
+
218
+ .metric {
219
+ display: flex;
220
+ justify-content: space-between;
221
+ align-items: center;
222
+ margin-bottom: 8px;
223
+ padding-bottom: 6px;
224
+ border-bottom: 1px dashed #dbcfbe;
225
+ font-size: .95rem;
226
+ }
227
+
228
+ .metric strong {
229
+ font-weight: 700;
230
+ }
231
+
232
+ .coach {
233
+ background: #fff7ed;
234
+ border: 1px solid #f2caa9;
235
+ border-radius: 10px;
236
+ padding: 10px;
237
+ min-height: 74px;
238
+ line-height: 1.4;
239
+ font-size: .92rem;
240
+ }
241
+
242
+ .chip-row {
243
+ display: flex;
244
+ flex-wrap: wrap;
245
+ gap: 8px;
246
+ margin-top: 10px;
247
+ }
248
+
249
+ .chip {
250
+ background: #eaf3ff;
251
+ border: 1px solid #b9d1ef;
252
+ color: #184469;
253
+ border-radius: 999px;
254
+ padding: 6px 10px;
255
+ font-size: .84rem;
256
+ cursor: pointer;
257
+ font-weight: 600;
258
+ }
259
+ </style>
260
+ </head>
261
+ <body>
262
+ <div class="wrap">
263
+ <div class="title">
264
+ <div>
265
+ <h1>Inbox Helper Practice</h1>
266
+ <p class="subtitle">Practice deciding priority, category, and who should handle each email.</p>
267
+ </div>
268
+ <span class="badge" id="badge">connecting...</span>
269
+ </div>
270
+
271
+ <div class="grid">
272
+ <section class="card">
273
+ <h2>Start a Scenario</h2>
274
+ <p class="help">Pick a difficulty, then click Start.</p>
275
+ <div class="row">
276
+ <select id="taskId">
277
+ <option value="task_easy">Easy: one clear email</option>
278
+ <option value="task_medium">Medium: mixed inbox</option>
279
+ <option value="task_hard">Hard: high-risk complaint</option>
280
+ <option value="task_production">Production: full inbox simulator</option>
281
+ </select>
282
+ </div>
283
+ <div id="productionControls" style="display:none;">
284
+ <div class="row">
285
+ <select id="productionProfile">
286
+ <option value="light">Workload: Light</option>
287
+ <option value="standard" selected>Workload: Standard</option>
288
+ <option value="heavy">Workload: Heavy</option>
289
+ </select>
290
+ </div>
291
+ <div class="row">
292
+ <select id="businessHoursMode">
293
+ <option value="false" selected>Time Profile: 24x7 inbox</option>
294
+ <option value="true">Time Profile: business hours focus</option>
295
+ </select>
296
+ </div>
297
+ <div class="row">
298
+ <select id="escalationMode">
299
+ <option value="low">Escalation: Low</option>
300
+ <option value="normal" selected>Escalation: Normal</option>
301
+ <option value="high">Escalation: High</option>
302
+ </select>
303
+ </div>
304
+ </div>
305
+ <div class="row">
306
+ <button class="accent" id="btnReset">Start</button>
307
+ <button class="secondary" id="btnState">Check Progress</button>
308
+ </div>
309
+ <div class="status" id="status">Ready. Start a scenario.</div>
310
+ </section>
311
+
312
+ <section class="card">
313
+ <h2>Your Decision</h2>
314
+ <p class="help">Choose priority, who should handle it, and a short reason.</p>
315
+ <div class="row">
316
+ <select id="label">
317
+ <option value="urgent">Urgent</option>
318
+ <option value="normal" selected>Normal</option>
319
+ <option value="spam">Spam</option>
320
+ <option value="archive">Archive</option>
321
+ </select>
322
+ </div>
323
+ <div class="row">
324
+ <input id="routeTo" placeholder="Who should handle this? (billing, safety, engineering, support)" value="general" />
325
+ </div>
326
+ <div class="row">
327
+ <textarea id="summary" placeholder="Write one clear sentence with key clues from the email.">Needs review.</textarea>
328
+ </div>
329
+ <div class="row">
330
+ <button id="btnStep">Send Decision</button>
331
+ </div>
332
+ </section>
333
+
334
+ <section class="card wide">
335
+ <h2>Current Email</h2>
336
+ <div class="email-block">
337
+ <div class="email-row"><strong>Subject:</strong> <span id="mailSubject">No email loaded yet.</span></div>
338
+ <div class="email-row"><strong>From:</strong> <span id="mailSender">-</span></div>
339
+ <div class="email-row"><strong>Message:</strong> <span id="mailBody">Start a scenario to load an email.</span></div>
340
+ </div>
341
+ </section>
342
+
343
+ <section class="card">
344
+ <h2>Live Progress</h2>
345
+ <div class="metric"><span>Task</span><strong id="insightTask">-</strong></div>
346
+ <div class="metric"><span>Scenario</span><strong id="insightScenario">-</strong></div>
347
+ <div class="metric"><span>Progress</span><strong id="insightProgress">0/0</strong></div>
348
+ <div class="metric"><span>Last Reward</span><strong id="insightReward">-</strong></div>
349
+ <div class="metric"><span>Base Score</span><strong id="insightBase">-</strong></div>
350
+ </section>
351
+
352
+ <section class="card">
353
+ <h2>Coach Notes</h2>
354
+ <p class="help">Use this to improve your next triage action.</p>
355
+ <div class="coach" id="coachNotes">Start a scenario and submit one decision to get feedback.</div>
356
+ <div class="chip-row">
357
+ <button class="chip" id="chipSafety">Quick Fill: Urgent + Safety</button>
358
+ <button class="chip" id="chipBilling">Quick Fill: Normal + Billing</button>
359
+ <button class="chip" id="chipSpam">Quick Fill: Spam + General</button>
360
+ </div>
361
+ </section>
362
+
363
+ <section class="card wide">
364
+ <h2>Details (Advanced)</h2>
365
+ <pre id="output">Waiting for your first action...</pre>
366
+ </section>
367
+ </div>
368
+ </div>
369
+
370
+ <script>
371
+ const statusEl = document.getElementById('status');
372
+ const badgeEl = document.getElementById('badge');
373
+ const outEl = document.getElementById('output');
374
+ const mailSubjectEl = document.getElementById('mailSubject');
375
+ const mailSenderEl = document.getElementById('mailSender');
376
+ const mailBodyEl = document.getElementById('mailBody');
377
+ const taskIdEl = document.getElementById('taskId');
378
+ const productionControlsEl = document.getElementById('productionControls');
379
+ const insightTaskEl = document.getElementById('insightTask');
380
+ const insightScenarioEl = document.getElementById('insightScenario');
381
+ const insightProgressEl = document.getElementById('insightProgress');
382
+ const insightRewardEl = document.getElementById('insightReward');
383
+ const insightBaseEl = document.getElementById('insightBase');
384
+ const coachNotesEl = document.getElementById('coachNotes');
385
+
386
+ function setStatus(msg, isError = false) {
387
+ statusEl.textContent = msg;
388
+ statusEl.classList.toggle('error', isError);
389
+ }
390
+
391
+ function writeOutput(value) {
392
+ outEl.textContent = typeof value === 'string' ? value : JSON.stringify(value, null, 2);
393
+ }
394
+
395
+ function updateEmailPanel(data) {
396
+ if (!data || !data.observation) {
397
+ return;
398
+ }
399
+ const obs = data.observation;
400
+ mailSubjectEl.textContent = obs.subject || 'No subject';
401
+ mailSenderEl.textContent = obs.sender || '-';
402
+ mailBodyEl.textContent = obs.body || '';
403
+ }
404
+
405
+ function updateProductionControlsVisibility() {
406
+ const isProduction = taskIdEl.value === 'task_production';
407
+ productionControlsEl.style.display = isProduction ? 'block' : 'none';
408
+ }
409
+
410
+ function safeNumber(value) {
411
+ return typeof value === 'number' && !Number.isNaN(value) ? value : null;
412
+ }
413
+
414
+ function updateInsights(data) {
415
+ const info = (data && data.info) ? data.info : {};
416
+ const taskValue = info.task_id || data.task_id || (data.observation && data.observation.task_id) || '-';
417
+ const scenarioValue = info.scenario_id || '-';
418
+
419
+ insightTaskEl.textContent = taskValue;
420
+ insightScenarioEl.textContent = scenarioValue;
421
+
422
+ const emailsProcessed = safeNumber(info.emails_processed);
423
+ const emailsTotal = safeNumber(info.emails_total);
424
+ if (emailsProcessed !== null && emailsTotal !== null) {
425
+ insightProgressEl.textContent = `${emailsProcessed}/${emailsTotal}`;
426
+ } else if (safeNumber(data.current_step) !== null && safeNumber(data.total_steps) !== null) {
427
+ insightProgressEl.textContent = `${data.current_step}/${data.total_steps}`;
428
+ }
429
+
430
+ const rewardValue = safeNumber(data.reward);
431
+ insightRewardEl.textContent = rewardValue !== null ? rewardValue.toFixed(6) : '-';
432
+
433
+ const baseScoreValue = safeNumber(info.base_score);
434
+ insightBaseEl.textContent = baseScoreValue !== null ? baseScoreValue.toFixed(6) : '-';
435
+
436
+ const tips = [];
437
+ if (info.validation_error) {
438
+ tips.push('Action format is invalid. Keep label/summary/route_to filled correctly.');
439
+ }
440
+
441
+ const routeNoise = safeNumber(info.grade_route_noise_penalty);
442
+ if (routeNoise !== null && routeNoise > 0.01) {
443
+ tips.push('Route to one best owner team. Avoid sending to many teams at once.');
444
+ }
445
+
446
+ const summaryMatch = safeNumber(info.grade_summary_match);
447
+ if (summaryMatch !== null && summaryMatch < 0.6) {
448
+ tips.push('Summary is weak. Include concrete clues from subject/body/thread.');
449
+ }
450
+
451
+ const labelMatch = safeNumber(info.grade_label_match);
452
+ if (labelMatch !== null && labelMatch < 1.0) {
453
+ tips.push('Priority label may be off. Re-check urgency and risk signals.');
454
+ }
455
+
456
+ const routeMatch = safeNumber(info.grade_route_match);
457
+ if (routeMatch !== null && routeMatch < 1.0) {
458
+ tips.push('Routing looks off. Pick the team that directly owns this issue.');
459
+ }
460
+
461
+ const urgencyComponent = safeNumber(info.grade_urgency_component);
462
+ if (urgencyComponent !== null && urgencyComponent < 0.2) {
463
+ tips.push('For high-risk complaints, mark urgent and route to safety first.');
464
+ }
465
+
466
+ if (!tips.length && typeof info.grading_feedback === 'string' && info.grading_feedback) {
467
+ tips.push(info.grading_feedback);
468
+ }
469
+
470
+ coachNotesEl.textContent = tips.length
471
+ ? tips.join(' ')
472
+ : 'Looks good. Keep your next route precise and your summary evidence-based.';
473
+ }
474
+
475
+ function prefillAction(label, routeTo, summary) {
476
+ document.getElementById('label').value = label;
477
+ document.getElementById('routeTo').value = routeTo;
478
+ document.getElementById('summary').value = summary;
479
+ }
480
+
481
+ async function postJson(path, payload) {
482
+ const response = await fetch(path, {
483
+ method: 'POST',
484
+ headers: { 'Content-Type': 'application/json' },
485
+ body: JSON.stringify(payload || {}),
486
+ });
487
+ const text = await response.text();
488
+ let data = text;
489
+ try { data = JSON.parse(text); } catch (e) {}
490
+ if (!response.ok) {
491
+ throw new Error('HTTP ' + response.status + ' - ' + text);
492
+ }
493
+ return data;
494
+ }
495
+
496
+ async function warmup() {
497
+ try {
498
+ const res = await fetch('/meta');
499
+ const data = await res.json();
500
+ badgeEl.textContent = data.status === 'ok' ? 'ready' : 'check service';
501
+ } catch (e) {
502
+ badgeEl.textContent = 'offline';
503
+ }
504
+ }
505
+
506
+ document.getElementById('btnReset').addEventListener('click', async () => {
507
+ const taskId = taskIdEl.value;
508
+ setStatus('Starting a new scenario...');
509
+ try {
510
+ const payload = { task_id: taskId };
511
+ if (taskId === 'task_production') {
512
+ payload.production_profile = document.getElementById('productionProfile').value;
513
+ payload.business_hours_mode = document.getElementById('businessHoursMode').value === 'true';
514
+ payload.escalation_mode = document.getElementById('escalationMode').value;
515
+ }
516
+ const data = await postJson('/reset', payload);
517
+ setStatus('Scenario started. Read the email below.');
518
+ updateEmailPanel(data);
519
+ updateInsights(data);
520
+ writeOutput(data);
521
+ } catch (e) {
522
+ setStatus('Could not start scenario. See details below.', true);
523
+ writeOutput(String(e));
524
+ }
525
+ });
526
+
527
+ document.getElementById('btnState').addEventListener('click', async () => {
528
+ setStatus('Checking progress...');
529
+ try {
530
+ const data = await postJson('/state', {});
531
+ setStatus('Progress updated.');
532
+ updateInsights(data);
533
+ writeOutput(data);
534
+ } catch (e) {
535
+ setStatus('Could not fetch progress. See details below.', true);
536
+ writeOutput(String(e));
537
+ }
538
+ });
539
+
540
+ document.getElementById('btnStep').addEventListener('click', async () => {
541
+ const payload = {
542
+ label: document.getElementById('label').value,
543
+ summary: document.getElementById('summary').value,
544
+ route_to: document.getElementById('routeTo').value,
545
+ };
546
+ setStatus('Sending your decision...');
547
+ try {
548
+ const data = await postJson('/step', payload);
549
+ setStatus('Decision saved.');
550
+ updateEmailPanel(data);
551
+ updateInsights(data);
552
+ writeOutput(data);
553
+ } catch (e) {
554
+ setStatus('Could not submit decision. See details below.', true);
555
+ writeOutput(String(e));
556
+ }
557
+ });
558
+
559
+ document.getElementById('chipSafety').addEventListener('click', () => {
560
+ prefillAction('urgent', 'safety', 'Potential safety risk with immediate escalation needed.');
561
+ });
562
+
563
+ document.getElementById('chipBilling').addEventListener('click', () => {
564
+ prefillAction('normal', 'billing', 'Customer billing issue needs finance team review and response.');
565
+ });
566
+
567
+ document.getElementById('chipSpam').addEventListener('click', () => {
568
+ prefillAction('spam', 'general', 'Likely phishing or irrelevant message with suspicious external request.');
569
+ });
570
+
571
+ taskIdEl.addEventListener('change', updateProductionControlsVisibility);
572
+
573
+ updateProductionControlsVisibility();
574
+ warmup();
575
+ </script>
576
+ </body>
577
+ </html>
578
+ """
579
+
580
# Flask application; the route functions below register themselves on it.
app = Flask(__name__)
# Environment instance used by /step and /state; /reset rebinds this name.
current_env = EmailTriageEnv(task_id="task_easy")
# Per-task index of the next scenario to serve when /reset picks one itself.
SCENARIO_COUNTERS = {task_id: 0 for task_id in list_task_ids()}
# Split used by /reset unless client overrides are enabled.
DEFAULT_EVAL_SPLIT = os.getenv("OPENENV_EVAL_SPLIT", "public")
# True only when the env var is exactly "true" (case-insensitive, trimmed);
# gates client-supplied eval_split / scenario_index in /reset.
ALLOW_CLIENT_EVAL_OVERRIDE = (
    os.getenv("OPENENV_ALLOW_CLIENT_EVAL_OVERRIDE", "false").strip().lower() == "true"
)
587
+
588
+
589
@app.get("/")
def root_page():
    """Serve the embedded single-page frontend as HTML."""
    page = Response(FRONTEND_HTML, mimetype="text/html")
    return page
593
+
594
+
595
@app.get("/meta")
def root_endpoint():
    """Describe the service: status, endpoints, scenario pools, and controls.

    Returns:
        JSON response with the service name, an ``ok`` status, the POST
        endpoints, public scenario counts per task, the configured eval
        split, and the accepted production runtime control values.
    """
    endpoints = {
        "reset": {"method": "POST", "path": "/reset"},
        "step": {"method": "POST", "path": "/step"},
        "state": {"method": "POST", "path": "/state"},
    }

    # Only the "public" pool sizes are reported here.
    public_pool_sizes = {}
    for task_id in list_task_ids():
        public_pool_sizes[task_id] = get_task_scenario_count(task_id, "public")

    runtime_controls = {
        "production_profile": ["light", "standard", "heavy"],
        "business_hours_mode": [True, False],
        "escalation_mode": ["low", "normal", "high"],
    }

    metadata = {
        "name": "email-triage-env",
        "status": "ok",
        "endpoints": endpoints,
        "scenario_pools": {"public": public_pool_sizes},
        "eval_split": DEFAULT_EVAL_SPLIT,
        "production_runtime_controls": runtime_controls,
    }
    return jsonify(metadata)
621
+
622
+
623
@app.post("/reset")
def reset_endpoint():
    """Reset the environment with a selected task and return ResetResult JSON.

    Request body (JSON object, all fields optional):
        task_id: Task to load; defaults to ``"task_easy"``.
        production_profile / escalation_mode / business_hours_mode: Only read
            when ``task_id`` is ``"task_production"``.
        eval_split / scenario_index: Only accepted when
            ``ALLOW_CLIENT_EVAL_OVERRIDE`` is true; otherwise their presence
            is rejected.

    Returns:
        Flask response containing reset payload, or a JSON ``error`` message
        with status 400 when the request is invalid or the environment lookup
        raises ``KeyError``.
    """
    # Only current_env is rebound; SCENARIO_COUNTERS is mutated in place and
    # therefore needs no global declaration.
    global current_env

    payload = request.get_json(silent=True)
    if payload is None:
        # Missing or unparsable body: fall back to all defaults.
        payload = {}
    elif not isinstance(payload, dict):
        return jsonify({"error": "Malformed JSON payload."}), 400

    task_id = payload.get("task_id", "task_easy")
    if not isinstance(task_id, str):
        return jsonify({"error": "Field 'task_id' must be a string."}), 400

    runtime_options: dict[str, object] = {}
    if task_id == "task_production":
        production_profile = payload.get("production_profile", "standard")
        if not isinstance(production_profile, str) or production_profile not in {
            "light",
            "standard",
            "heavy",
        }:
            return (
                jsonify(
                    {
                        "error": (
                            "Field 'production_profile' must be one of "
                            "light/standard/heavy."
                        )
                    }
                ),
                400,
            )

        escalation_mode = payload.get("escalation_mode", "normal")
        if not isinstance(escalation_mode, str) or escalation_mode not in {
            "low",
            "normal",
            "high",
        }:
            return (
                jsonify(
                    {
                        "error": (
                            "Field 'escalation_mode' must be one of "
                            "low/normal/high."
                        )
                    }
                ),
                400,
            )

        business_hours_mode = payload.get("business_hours_mode", False)
        if isinstance(business_hours_mode, str):
            # Accept common truthy spellings; any other string means False.
            business_hours_mode = business_hours_mode.strip().lower() in {
                "1",
                "true",
                "yes",
                "on",
            }
        elif not isinstance(business_hours_mode, bool):
            return jsonify({"error": "Field 'business_hours_mode' must be boolean."}), 400

        runtime_options = {
            "production_profile": production_profile,
            "business_hours_mode": business_hours_mode,
            "escalation_mode": escalation_mode,
        }

    if not ALLOW_CLIENT_EVAL_OVERRIDE and (
        "eval_split" in payload or "scenario_index" in payload
    ):
        return jsonify(
            {
                "error": (
                    "Client overrides for eval_split/scenario_index are disabled "
                    "by server policy."
                )
            }
        ), 400

    eval_split = DEFAULT_EVAL_SPLIT
    if ALLOW_CLIENT_EVAL_OVERRIDE:
        requested_split = payload.get("eval_split", DEFAULT_EVAL_SPLIT)
        if not isinstance(requested_split, str):
            return jsonify({"error": "Field 'eval_split' must be a string."}), 400
        eval_split = requested_split

    requested_index = payload.get("scenario_index") if ALLOW_CLIENT_EVAL_OVERRIDE else None
    # bool is a subclass of int, so JSON true/false would otherwise pass the
    # integer check and be used as index 1/0; reject it explicitly.
    if requested_index is not None and (
        isinstance(requested_index, bool)
        or not isinstance(requested_index, int)
        or requested_index < 0
    ):
        return jsonify({"error": "Field 'scenario_index' must be a non-negative integer."}), 400

    try:
        scenario_count = get_task_scenario_count(task_id, eval_split)
        if requested_index is None:
            # Serve scenarios round-robin per task.
            scenario_index = SCENARIO_COUNTERS.get(task_id, 0)
            if scenario_count > 0:
                SCENARIO_COUNTERS[task_id] = (scenario_index + 1) % scenario_count
        else:
            scenario_index = requested_index

        current_env = EmailTriageEnv(
            task_id=task_id,
            scenario_index=scenario_index,
            split=eval_split,
            runtime_options=runtime_options,
        )
        reset_result = current_env.reset()
    except KeyError as error:
        return jsonify({"error": str(error)}), 400

    return jsonify(reset_result.model_dump())
741
+
742
+
743
@app.post("/step")
def step_endpoint():
    """Apply one action to the active environment and return StepResult JSON.

    Returns:
        Flask response with the step payload, or a 400 JSON error when the
        request body is missing or cannot be parsed as JSON.
    """
    action_payload = request.get_json(silent=True)
    if action_payload is not None:
        outcome = current_env.step(action_payload)
        return jsonify(outcome.model_dump())

    return jsonify({"error": "Malformed JSON payload."}), 400
756
+
757
+
758
@app.post("/state")
def state_endpoint():
    """Return the active environment's state snapshot as JSON.

    Returns:
        Flask response containing the dumped result of ``current_env.state()``.
    """
    snapshot = current_env.state().model_dump()
    return jsonify(snapshot)
767
+
768
+
769
def main() -> None:
    """Start the Flask server on all interfaces at port 7860."""
    bind_host, bind_port = "0.0.0.0", 7860
    app.run(host=bind_host, port=bind_port)
772
+
773
+
774
+ if __name__ == "__main__":
775
+ main()