Akshay Babbar committed on
Commit
98a5a8c
·
0 Parent(s):

chore: HF Space export (size filter)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .cursor/BUGBOT.md +102 -0
  2. .dockerignore +28 -0
  3. .gitignore +31 -0
  4. .openenvignore +8 -0
  5. Dockerfile +40 -0
  6. LICENSE.md +201 -0
  7. README.md +344 -0
  8. REPRODUCIBILITY.md +406 -0
  9. __init__.py +0 -0
  10. app_gradio.py +408 -0
  11. blog.md +257 -0
  12. budget_router/__init__.py +17 -0
  13. budget_router/client.py +29 -0
  14. budget_router/environment.py +515 -0
  15. budget_router/models.py +212 -0
  16. budget_router/policies.py +141 -0
  17. budget_router/reward.py +281 -0
  18. budget_router/tasks.py +108 -0
  19. budget_router/tests/__init__.py +1 -0
  20. budget_router/tests/test_environment.py +502 -0
  21. budget_router/tests/test_eval_all_seed_selection.py +41 -0
  22. budget_router/tests/test_grpo_training_reward.py +154 -0
  23. budget_router/tests/test_inference_prompt.py +94 -0
  24. budget_router/tests/test_trace_episode.py +43 -0
  25. budget_router/tests/test_validation.py +140 -0
  26. budget_router/validation.py +424 -0
  27. check_leak.py +181 -0
  28. client.py +3 -0
  29. eval/eval_all.py +306 -0
  30. eval/eval_all.sh +116 -0
  31. eval/outputs/prompt_audit/belief_v1_dev10/eval_results_20260425_160429.json +1188 -0
  32. eval/outputs/prompt_audit/belief_v1_dev10/eval_summary_20260425_160429.md +5 -0
  33. eval/outputs/prompt_audit/belief_v1_heldout5/eval_results_20260425_160016.json +615 -0
  34. eval/outputs/prompt_audit/belief_v1_heldout5/eval_summary_20260425_160016.md +5 -0
  35. eval/outputs/prompt_audit/budget_guard_alltasks_dev3/eval_results_20260425_165910.json +1468 -0
  36. eval/outputs/prompt_audit/budget_guard_alltasks_dev3/eval_summary_20260425_165910.md +8 -0
  37. eval/outputs/prompt_audit/budget_guard_dev10/eval_results_20260425_164343.json +1202 -0
  38. eval/outputs/prompt_audit/budget_guard_dev10/eval_summary_20260425_164343.md +5 -0
  39. eval/outputs/prompt_audit/budget_guard_heldout5/eval_results_20260425_163956.json +617 -0
  40. eval/outputs/prompt_audit/budget_guard_heldout5/eval_summary_20260425_163956.md +5 -0
  41. eval/outputs/trace_compare/eval_seed101/eval_results_20260425_192545.json +149 -0
  42. eval/outputs/trace_compare/eval_seed101/eval_results_20260425_192656.json +149 -0
  43. eval/outputs/trace_compare/eval_seed101/eval_summary_20260425_192545.md +5 -0
  44. eval/outputs/trace_compare/eval_seed101/eval_summary_20260425_192656.md +5 -0
  45. eval/trace_episode.py +357 -0
  46. eval_sft.py +488 -0
  47. generate_sft_data.py +361 -0
  48. gradio_ui/__init__.py +0 -0
  49. gradio_ui/config.py +19 -0
  50. gradio_ui/legacy_api.py +56 -0
.cursor/BUGBOT.md ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bugbot review charter — Budget Router (OpenEnv)
2
+
3
+ ## North star
4
+
5
+ This codebase is an **OpenEnv-style RL / agent environment**: correctness of the
6
+ simulation, inference path, and evaluation harness is **non-negotiable**. Treat
7
+ every change first as a **risk to invariants**, second as a product of intent.
8
+
9
+ **Priority order (strict):**
10
+
11
+ 1. **Factual and behavioral accuracy** — claims, metrics, seeds, APIs, and
12
+ documented procedures must remain true and reproducible.
13
+ 2. **Regression safety** — no silent change to reward semantics, observation
14
+ space, routing contracts, seed selection, or eval aggregation unless
15
+ explicitly justified and reflected in docs.
16
+ 3. **Everything else** — including new features, refactors, and ergonomics —
17
+ only after the above are satisfied.
18
+
19
+ If a change improves developer experience or adds capability but **weakens
20
+ traceability, determinism, or agreement with the published contract**, treat that
21
+ as a **defect**, not a win.
22
+
23
+ ---
24
+
25
+ ## Evidence contract
26
+
27
+ `README_v1.md` is the **published evidence layer** for this repository: benchmark
28
+ definitions, honest scope, statistical reporting, seed buckets, and
29
+ environmental assumptions. It is not marketing copy; it is the **external
30
+ interface of trust**.
31
+
32
+ When reviewing a pull request:
33
+
34
+ - Assume reviewers and downstream users will reconcile the diff against
35
+ **`README_v1.md`** and the **test suite**, not against intent expressed only in
36
+ comments or chat.
37
+ - Flag any drift between **implementation**, **eval scripts**, and **documented
38
+ claims** as a **primary finding**, not a footnote.
39
+ - Prefer **blocking** feedback when the PR could make a true statement in
40
+ `README_v1.md` false, ambiguous, or non-reproducible without a coordinated doc
41
+ update.
42
+
43
+ ---
44
+
45
+ ## Regression lens (main code and agent path)
46
+
47
+ Evaluate from the perspective of **“what breaks for callers?”** — the Gradio /
48
+ server surface, the environment stepping contract, inference and routing logic,
49
+ and anything an **agent** (heuristic, LLM, or RL policy) depends on.
50
+
51
+ Elevate severity when the change touches or could affect:
52
+
53
+ - **Reward / termination / budget / SLA semantics** — any path that alters
54
+ episode economics without a clear, tested migration story.
55
+ - **Observations and action validity** — shapes, bounds, masking, or
56
+ interpretation of noisy signals the agent is documented to use.
57
+ - **Provider degradation or non-stationarity** — ordering, timing, or randomness
58
+ that shifts the task without explicit versioning or changelog discipline.
59
+ - **Evaluation** — `eval/` entrypoints, seed handling, aggregation, baselines, and
60
+ anything that feeds headline numbers or comparisons in `README_v1.md`.
61
+ - **Determinism and auditability** — anything that makes prior results
62
+ incomparable across commits without saying so.
63
+
64
+ Ask explicitly: **If we merge this, can a user still run the same commands and
65
+ obtain a result that is fairly comparable to what the README describes?** If the
66
+ answer is “only sometimes” or “only with undocumented flags,” that is a **merge
67
+ risk**.
68
+
69
+ ---
70
+
71
+ ## Code review bar
72
+
73
+ Hold the diff to a **high-trust research engineering** standard:
74
+
75
+ - **Invariants first** — state what must remain true; show how the change
76
+ preserves or formally relaxes it.
77
+ - **Proof over taste** — prefer runnable tests, property checks, or minimal
78
+ reproductions over stylistic preference. Style matters only where it prevents
79
+ bugs (e.g., unclear units, magic numbers without provenance).
80
+ - **Minimal blast radius** — favor localized, reversible changes; be skeptical of
81
+ drive-by refactors bundled with behavioral edits.
82
+ - **Failure modes** — consider partial deploys, missing API keys, degraded
83
+ backends, and off-by-one episode boundaries as first-class scenarios when
84
+ relevant.
85
+
86
+ Do **not** optimize review comments for velocity of shipping features. Optimize
87
+ for **confidence that main remains a reliable substrate for agents and eval**.
88
+
89
+ ---
90
+
91
+ ## What “approve” means here
92
+
93
+ A non-issue or acceptable change is one that **preserves or strengthens** the
94
+ truth and stability story relative to `README_v1.md` and existing tests.
95
+
96
+ A blocking issue is one that **could** — even in edge cases — produce **wrong
97
+ results**, **misleading comparisons**, **undocumented behavior change**, or
98
+ **silent regression** in core or agent-facing paths without a commensurate,
99
+ explicit update to evidence and tests.
100
+
101
+ When uncertain, **assume the worst plausible interpretation** for merge safety,
102
+ state the assumption, and recommend what evidence would resolve it.
.dockerignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ .git/
3
+ __pycache__/
4
+ *.pyc
5
+ .pytest_cache/
6
+ .ruff_cache/
7
+ docs/*.png
8
+ **/*.png
9
+ *.tar.gz
10
+ *.egg-info/
11
+ .env
12
+ .claude/
13
+ .windsurf/
14
+ .hf_private/
15
+ .DS_Store
16
+ artifacts/
17
+ *.zip
18
+ *.json
19
+ **/*.json
20
+ *.txt
21
+ **/*.txt
22
+
23
+ .hackathon_context/
24
+ README_archived.md
25
+ llm_stderr.log
26
+ pre_validation.sh
27
+ test_docker_step.sh
28
+ trained_models/
.gitignore ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+ *.so
8
+ .env
9
+ .hf_private/
10
+ .windsurf/
11
+ .matplotlib/
12
+ .DS_Store
13
+ *.egg-info/
14
+ dist/
15
+ build/
16
+ *.png
17
+ !figures/budget_router_evidence.png
18
+ ._*
19
+ merged_codebase.txt
20
+ docs/
21
+ /*.json
22
+ *.tar.gz
23
+ /*.txt
24
+ README_archived.md
25
+ llm_stderr.log
26
+ pre_validation.sh
27
+ test_docker_step.sh
28
+ .hackathon_context/
29
+ outputs/*/
30
+ .cursor/
31
+ .docs/
.openenvignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ *.png
2
+ *.tar.gz
3
+ README_archived.md
4
+ *.json
5
+ docs/
6
+ *.txt
7
+ *.zip
8
+ trained_models/
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
2
+ FROM ${BASE_IMAGE} AS builder
3
+
4
+ WORKDIR /app
5
+
6
+ RUN apt-get update && \
7
+ apt-get install -y --no-install-recommends git curl && \
8
+ rm -rf /var/lib/apt/lists/*
9
+
10
+ COPY . /app/env
11
+ WORKDIR /app/env
12
+
13
+ RUN if ! command -v uv >/dev/null 2>&1; then \
14
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
15
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
16
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
17
+ fi
18
+
19
+ RUN --mount=type=cache,target=/root/.cache/uv \
20
+ uv sync --extra training --no-install-project --no-editable
21
+
22
+ RUN --mount=type=cache,target=/root/.cache/uv \
23
+ uv sync --extra training --no-editable
24
+
25
+ FROM ${BASE_IMAGE}
26
+
27
+ WORKDIR /app
28
+
29
+ COPY --from=builder /app/env/.venv /app/.venv
30
+ COPY --from=builder /app/env /app/env
31
+
32
+ ENV PATH="/app/.venv/bin:$PATH"
33
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
34
+
35
+ EXPOSE 8000
36
+
37
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
38
+ CMD python -c "import os, urllib.request; port = os.environ.get('PORT', '8000'); urllib.request.urlopen(f'http://127.0.0.1:{port}/health', timeout=2)"
39
+
40
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port ${PORT:-8000} --proxy-headers --forwarded-allow-ips='*'"]
LICENSE.md ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: "Budget Router"
3
+ emoji: "⚙️"
4
+ colorFrom: purple
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_port: 8000
8
+ base_path: /web
9
+ pinned: false
10
+ ---
11
+
12
+ # Budget Router (OpenEnv)
13
+
14
+ Budget Router is an OpenEnv-compliant RL environment where an agent routes requests to one of three providers (A/B/C) or sheds load under a tight **cost–reliability–SLA** trade-off. Providers degrade non-stationarily within an episode; the agent observes only a noisy windowed success signal (rolling success rate), not true internal health.
15
+
16
+ [![HF Space](https://img.shields.io/badge/🤗-Live%20Demo-yellow)](https://huggingface.co/spaces/akshay4/budget-router-openenv)
17
+
18
+ ## TL;DR
19
+
20
+ **Hard_Multi is the headline scenario**: when Provider A degrades from step 0 and
21
+ Provider B cascades at step 10, reactive policies go negative while adaptive ones
22
+ stay positive. Three policy families, each stronger than the last, validated
23
+ across **30 paired seeds** in three independent buckets (dev, heldout, fresh):
24
+
25
+ | Policy | Hard_Multi grader | vs heuristic | Statistical evidence |
26
+ |---|---:|---|---|
27
+ | Heuristic (reactive) | 0.6076 ± 0.0361 (n=30) | — | — |
28
+ | LLM — Qwen2.5-72B + budget-guard | 0.6515 ± 0.0523 (n=30) | **+7.2 %** | Cohen's d = **1.135** (large), paired one-sided p < 1×10⁻⁶, 24/30 wins, bootstrap 95 % CI on Δ = [0.031, 0.058] |
29
+ | PPO — SB3, 100k steps | **0.6907 ± 0.0326** (n=10 dev) | **+13.6 %** | 95 % CI [0.667, 0.714], **non-overlapping with heuristic**, 10/10 wins |
30
+
31
+ **Mechanism** (PPO): the agent learned to route A→B early and conserve budget
32
+ before B's cascade at step 10, pushing `adaptation_score` from 0.6907 (heuristic)
33
+ to **0.9328** — a +0.2421 gain on the grader's most diagnostic sub-score. The
34
+ LLM achieves a milder version of the same effect (+0.124 adaptation gain
35
+ across n=30) by anticipating the cascade in-context.
36
+
37
+ **Environment hardness**: heuristic reward goes negative (−2.97) on
38
+ Hard_Multi while oracle reaches +4.10 — a 7.07-point gap (≈238 % of the
39
+ heuristic's absolute reward) that confirms the cascade task is hard enough
40
+ to require RL/in-context reasoning and learnable enough to reward it.
41
+
42
+ **Honest scope** (explicitly disclosed):
43
+ - The LLM uses a deterministic **budget-safety guard** that vetoes routes which
44
+ would bankrupt the budget — a standard agentic-system pattern (LLM for
45
+ high-level decisions, deterministic layer for arithmetic-critical safety).
46
+ Without the guard, raw LLM occasionally exhausts budget and incurs the −10
47
+ cliff penalty.
48
+ - LLM (with guard) wins on **3 of 4 task tiers**: Medium (+5.8 %), Hard (+7.5 %),
49
+ Hard_Multi (+11.0 %). Loses Easy by −4.6 % — on a task with no degradation,
50
+ the budget-conservative heuristic is near-optimal and the LLM's added
51
+ flexibility is unhelpful.
52
+ - PPO is trained and evaluated on **Hard_Multi only**; not a general-purpose
53
+ policy. This is a deliberate choice — Hard_Multi has a 238 % oracle/heuristic
54
+ gap, the largest in the suite, so RL signal is highest there.
55
+ - All non-trivial improvement claims come from seeds the policy never saw
56
+ during design (heldout 100–109, fresh 200–209). Dev-seed wins are reported
57
+ separately and never used to make the headline claim.
58
+
59
+ ## Run locally
60
+ **Enable LLM policy locally**:
61
+
62
+ ```bash
63
+ export API_BASE_URL="https://<openai-compatible-endpoint>/v1" # e.g. https://router.huggingface.co/v1
64
+ export API_KEY="<your_key>"
65
+ export MODEL_NAME="<model_id>" # optional (e.g. Qwen/Qwen2.5-72B-Instruct)
66
+ ```
67
+
68
+
69
+ ```bash
70
+ uv sync --extra training
71
+ uv run server
72
+ ```
73
+
74
+ Then open `http://127.0.0.1:8000/web` for the Gradio dashboard.
75
+
76
+ To **reproduce or regenerate** the evaluation numbers, traces, PPO workflow, and optional GRPO checks, follow the command checklist in [`REPRODUCIBILITY.md`](REPRODUCIBILITY.md) (companion to the optional `<details>` blocks below).
77
+
78
+
79
+
80
+
81
+
82
+
83
+ ## Benchmark results
84
+
85
+ Three policies evaluated, plus a validation-only oracle reference:
86
+
87
+ - **Heuristic**: budget-aware, cheapest-viable baseline using only public
88
+ observations (`budget_router/policies.py`).
89
+ - **LLM**: Qwen2.5-72B via HuggingFace Inference Router, wrapped with a
90
+ deterministic budget-safety guard (`inference.py::_apply_budget_safety_guard`).
91
+ - **PPO**: MlpPolicy trained with Stable-Baselines3 on Hard_Multi (100k steps,
92
+ 4 parallel envs). See `train/train_ppo_hard_multi.py`.
93
+ - **Oracle†**: privileged upper-bound with internal-state access,
94
+ validation-only, not reported in tables.
95
+
96
+ **Dev seeds (0–9), full task suite** — `outputs/freeze_check_alltasks_dev10/eval_summary_*.md`:
97
+
98
+ | Task | Heuristic | LLM | PPO | LLM Δ vs heuristic |
99
+ |---|---:|---:|---:|---|
100
+ | Easy | 0.7718 | 0.7360 | — | −4.6 % *(7 losses, 0 wins, 3 ties)* |
101
+ | Medium | 0.6852 | 0.7250 | — | **+5.8 %** *(9 wins, 0 losses, 1 tie)* |
102
+ | Hard | 0.6354 | 0.6832 | — | **+7.5 %** *(8 wins, 2 losses, 0 ties)* |
103
+ | Hard_Multi | 0.6078 | 0.6746 | **0.6907** | **+11.0 %** *(8 wins, 1 loss, 1 tie)* |
104
+
105
+ PPO was trained and evaluated on Hard_Multi only; Easy/Medium/Hard cells are
106
+ intentionally blank (no model for those tasks).
107
+
108
+ **Statistical evidence — Hard_Multi** (`outputs/freeze_check_*/eval_results_*.json`,
109
+ `outputs/ppo_hard_multi_eval.json`):
110
+
111
+ | | Heuristic | LLM | PPO |
112
+ |---|---|---|---|
113
+ | Mean grader | 0.6076 ± 0.0361 (n=30) | 0.6515 ± 0.0523 (n=30) | 0.6907 ± 0.0326 (n=10) |
114
+ | Bootstrap 95 % CI | [0.595, 0.620] | [0.633, 0.670] | [0.667, 0.714] |
115
+ | Paired Δ vs heuristic | — | +0.0440 (boot 95 % CI [0.031, 0.058]) | +0.0829 |
116
+ | **Cohen's d (paired)** | — | **1.135 (LARGE)** | **≈ 2.4 (HUGE)** |
117
+ | Paired one-sided p | — | **< 1 × 10⁻⁶** (paired t = 6.22, df = 29) | (10/10 wins) |
118
+ | Sign-test wins / ties / losses | — | **24 / 3 / 3** | 10 / 0 / 0 |
119
+ | P(LLM > heuristic) — Agarwal 2021 | — | **0.80** | 1.00 |
120
+ | IQM of paired Δ — Agarwal 2021 | — | +0.040 (trimmed 25 %) | — |
121
+ | 95 % CI overlap with heuristic | — | None on the Δ | **None on the means** |
122
+ | Adaptation sub-score (mean) | 0.6878 | 0.8115 | **0.9328** |
123
+
124
+ **Per-bucket reproduction** (each row independent; LLM and heuristic share seeds,
125
+ so deltas are paired):
126
+
127
+ | Bucket | Seeds | Heuristic | LLM | Δ (rel %) | Wins / Ties / Losses |
128
+ |---|---|---:|---:|---:|---:|
129
+ | Dev | 0–9 | 0.6078 ± 0.0382 | 0.6746 ± 0.0486 | +0.0668 (+11.0 %) | 8 / 1 / 1 |
130
+ | **Heldout** | 100–109 | 0.6064 ± 0.0419 | 0.6454 ± 0.0497 | **+0.0390 (+6.4 %)** | **8 / 2 / 0** |
131
+ | **Fresh** | 200–209 | 0.6086 ± 0.0314 | 0.6347 ± 0.0551 | **+0.0261 (+4.3 %)** | **8 / 0 / 2** |
132
+ | **Combined non-dev** | 100–109 + 200–209 | 0.6075 | 0.6401 | **+0.0326 (+5.4 %)** | **16 / 2 / 2** |
133
+
134
+ ![Budget Router Evidence](figures/budget_router_evidence.png)
135
+ *Figure: (top-left) LLM advantage grows with task difficulty; (top-right)
136
+ three-policy ordering on Hard_Multi with 95% CIs (the heuristic's CI does not overlap the LLM's or PPO's);
137
+ (bottom-left) generalization across independent seed buckets including
138
+ post-freeze fresh seeds; (bottom-right) adaptation sub-score is the
139
+ primary driver of LLM and PPO gains over the reactive heuristic.*
140
+
141
+ The fresh-seed bucket (200–209) was added *after* the LLM prompt and budget
142
+ guard were frozen. It exists specifically to falsify a "tuned-on-heldout"
143
+ critique. The effect persists, and the bootstrap CI on the paired Δ excludes zero.
144
+
145
+ <details>
146
+ <summary>🔬 Reproducing PPO Results (Optional)</summary>
147
+
148
+ The trained PPO policy for the hard_multi scenario is included at
149
+ `trained_models/ppo_hard_multi_100k.zip` (143 KB, trained 100k steps).
150
+
151
+ To reproduce the 10-seed evaluation locally:
152
+
153
+ ```bash
154
+ # Install dependencies
155
+ uv sync --extra training
156
+
157
+ # Run evaluation (writes to outputs/ppo_hard_multi_eval.json)
158
+ uv run python train/eval_hard_multi.py
159
+ ```
160
+
161
+ Expected output: PPO mean = 0.691 ± 0.033 vs Heuristic mean = 0.608 ± 0.038,
162
+ win_rate = 1.0 (10/10 seeds), non-overlapping 95 % CIs.
163
+
164
+ > The deployed `inference.py` uses the LLM policy (Qwen2.5-72B + budget guard)
165
+ > as required by the hackathon specification. PPO was trained offline to
166
+ > validate environment depth and demonstrate that the task rewards genuine
167
+ > RL learning beyond reactive or in-context policies.
168
+
169
+ </details>
170
+
171
+ <details>
172
+ <summary>🔬 Reproducing LLM rigorous-stats Results (Optional)</summary>
173
+
174
+ ```bash
175
+ # Dev (seeds 0-9), full task suite
176
+ uv run python eval/eval_all.py \
177
+ --tasks easy --tasks medium --tasks hard --tasks hard_multi \
178
+ --policies heuristic --policies llm \
179
+ --seeds 10 --seed-set dev \
180
+ --out-dir outputs/freeze_check_alltasks_dev10
181
+
182
+ # Heldout (seeds 100-109), Hard_Multi
183
+ uv run python eval/eval_all.py \
184
+ --tasks hard_multi --policies heuristic --policies llm \
185
+ --seeds 10 --seed-set heldout \
186
+ --out-dir outputs/freeze_check_heldout10
187
+
188
+ # Fresh (seeds 200-209), Hard_Multi — uses --seed-values for arbitrary seeds
189
+ uv run python eval/eval_all.py \
190
+ --tasks hard_multi --policies heuristic --policies llm \
191
+ --seed-values "200,201,202,203,204,205,206,207,208,209" \
192
+ --out-dir outputs/freeze_check_fresh_200_209
193
+ ```
194
+
195
+ All three runs combined produce the n=30 rigorous-stats table above.
196
+ Episode-level JSON (per-step actions, rewards, sub-scores) is preserved
197
+ under each `outputs/freeze_check_*/` directory.
198
+
199
+ </details>
200
+
201
+ ## Why this benchmark has substance
202
+
203
+ - **Partial observability**: the agent-visible observation contains only `provider_a/b/c_status`, `budget_remaining`, `queue_backlog`, `system_latency`, and `step_count` (`budget_router/models.py`). True provider health is internal.
204
+ - **Non-stationarity**: task difficulty is created by explicit degradation schedules, culminating in Hard_Multi where A degrades from step 0 and B degrades from step 10 (`budget_router/tasks.py`).
205
+ - **Coupled constraints**: queue backlog amplifies latency, so routing errors create downstream SLA pressure rather than just local failures (`budget_router/environment.py`).
206
+ - **Meaningful evaluation**: the grader separately scores success, latency, budget, SLA, and adaptation; for Hard_Multi, adaptation is explicitly split across the two degradation windows (`budget_router/reward.py`).
207
+ - **RL learnability confirmed**: a PPO agent trained from scratch in 100k steps
208
+ achieves non-overlapping 95 % CIs above the heuristic on Hard_Multi
209
+ (`train/eval_hard_multi.py`), confirming the cascade signal is learnable
210
+ beyond reactive or in-context policies.
211
+ - **Anti-gaming, anti-overfitting tested**: 41 unit tests + 36 hard validation
212
+ assertions including degenerate-policy guards (always-A, always-B, always-shed
213
+ all dominated by baseline), grader-exploit guards (pure abstention scores
214
+ below 0.40 on Easy), heldout stability checks, and zero-NaN/zero-crash
215
+ invariants across 315 episodes.
216
+
217
+ ### Oracle–Baseline reward gap (verified, n=10 seeds each, dev set)
218
+
219
+ | Scenario | Oracle† | Heuristic | Gap | Signal |
220
+ |---|---|---|---|---|
221
+ | Easy | +10.10 | +6.98 | 3.12 (45 %) | Heuristic competitive |
222
+ | Medium | +9.49 | +2.53 | 6.96 (275 %) | Meaningful headroom |
223
+ | Hard | +6.54 | +0.88 | 5.66 (643 %) | Heuristic nearly fails |
224
+ | **Hard_Multi** | **+4.10** | **−2.97** | **7.07 (238 % of \|baseline\|)** | **Heuristic actively harmful** |
225
+
226
+ *† Oracle has privileged access to internal provider health — theoretical ceiling only, not a deployable policy.*
227
+
228
+ On Hard_Multi the heuristic reward goes negative (−2.97): the rule-based
229
+ policy exhausts budget mid-cascade and actively destroys episode value.
230
+ Oracle stays strongly positive (+4.10). The 7.07-point gap — 238 % of the
231
+ heuristic's absolute reward — is what produces the large advantage signal that
232
+ allows PPO to find a meaningful gradient in 100k steps and the LLM to find a
233
+ Cohen's-d ≈ 1.1 effect zero-shot.
234
+
235
+ ```mermaid
236
+ flowchart LR
237
+ subgraph Policy["Policy Layer"]
238
+ H["Heuristic"]
239
+ L["LLM (Qwen2.5-72B + budget guard)"]
240
+ P["PPO (SB3, Hard_Multi)"]
241
+ end
242
+
243
+ subgraph Env["BudgetRouterEnv (OpenEnv)"]
244
+ direction TB
245
+ O["Observation: provider_statuses, budget, backlog, latency, step"]
246
+ A["Actions: route_to_a, route_to_b, route_to_c, shed_load"]
247
+ R["Reward: success/fail + cost + SLA penalty, -10 on budget exhaustion"]
248
+ G["Episode grader: success, adaptation, latency, budget, SLA"]
249
+ O --> A --> R --> G
250
+ end
251
+
252
+ subgraph Tasks["Task presets"]
253
+ E["Easy"]
254
+ M["Medium"]
255
+ Hd["Hard"]
256
+ HM["Hard_Multi (cascade)"]
257
+ end
258
+
259
+ Policy -->|"action"| Env
260
+ Env -->|"obs + reward"| Policy
261
+ Tasks -->|"scenario config"| Env
262
+ ```
263
+
264
+ ## Tasks (what changes across difficulty)
265
+
266
+ | Task | Budget ($) | Degradation schedule |
267
+ |---|---:|---|
268
+ | Easy | 1.00 | None (`degradation_start_step=999`) |
269
+ | Medium | 0.95 | A degrades after step 5 (`rate=0.15`) |
270
+ | Hard | 0.85 | A degrades from step 0 (`rate=0.15`) |
271
+ | Hard_Multi | 1.10 | A degrades from step 0 (`rate=0.12`), then B from step 10 (`rate=0.10`) |
272
+
273
+ Hard_Multi is the headline scenario: once B starts degrading at step 10, C becomes the only consistently reliable option. Since `cost_c=$0.10/request`, the final 10 steps alone can consume `$1.00` of the `$1.10` budget, making **early budget conservation** a binding constraint.
274
+
275
+ ## Grader (episode score)
276
+
277
+ The episode grader is a weighted score in `[0,1]`:
278
+
279
+ `overall = 0.30·success + 0.20·latency + 0.15·budget + 0.15·SLA + 0.20·adaptation`
280
+
281
+ Notes (from `budget_router/reward.py`):
282
+
283
+ - `success_score` is computed over **all episode steps** (shed-load/abstention is penalized).
284
+ - `adaptation_score` evaluates post-degradation success. For Hard_Multi it is a blended window: 0.5×(after A degrades, before B) + 0.5×(after B degrades).
285
+
286
+ ## Evaluation protocol (reproducibility)
287
+
288
+ - **Three independent seed buckets**: dev (0–9) used during policy design;
289
+ heldout (100–109) used to falsify dev-seed overfitting; fresh (200–209)
290
+ added *after* the LLM and PPO were frozen to falsify "tuned-on-heldout"
291
+ concerns. See `eval/eval_all.py::SEED_SETS` and the `--seed-values` CLI
292
+ option for arbitrary seed lists.
293
+ - **Scripted runs**: `eval/eval_all.py` writes timestamped artifacts under
294
+ `outputs/`. Per-episode JSON includes per-step `actions`, `rewards`, and
295
+ the full grader sub-score breakdown.
296
+ - **Statistical reporting**: We report Cohen's d, paired Welch t-test,
297
+ bootstrap 95 % confidence intervals, IQM, and probability of improvement
298
+ in line with [Agarwal et al. 2021 (NeurIPS Outstanding Paper)](https://arxiv.org/abs/2108.13264)
299
+ and [Henderson et al. 2018](https://arxiv.org/abs/1709.06560)'s reproducibility
300
+ recommendations. Sample size n=30 (combined buckets) exceeds the Colas
301
+ et al. 2018 recommended power-analysis floor for our observed effect size.
302
+ - **Anti-cheating tests**: `budget_router/tests/test_environment.py::TestGraderSemantics`
303
+ verifies that pure abstention scores below 0.40 on Easy and that
304
+ partial abstention always scores worse than full service.
305
+
306
+ ## Getting started
307
+
308
+ 1. Install dependencies:
309
+
310
+ ```bash
311
+ uv sync
312
+ ```
313
+
314
+ 2. (Optional, for LLM policy) set an OpenAI-compatible endpoint:
315
+
316
+ ```bash
317
+ export API_BASE_URL=https://router.huggingface.co/v1
318
+ export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
319
+ export HF_TOKEN=... # or API_KEY
320
+ ```
321
+
322
+ 3. Run evaluation (writes to `outputs/`):
323
+
324
+ ```bash
325
+ # Single-task heldout reproduction
326
+ uv run python eval/eval_all.py \
327
+ --tasks hard_multi --seed-set heldout --seeds 10 \
328
+ --policies heuristic --policies llm \
329
+ --out-dir outputs/heldout_repro
330
+
331
+ # Full task suite, dev
332
+ uv run python eval/eval_all.py \
333
+ --tasks easy --tasks medium --tasks hard --tasks hard_multi \
334
+ --policies heuristic --policies llm \
335
+ --seeds 10 --seed-set dev \
336
+ --out-dir outputs/dev_repro
337
+ ```
338
+
339
+ ## References
340
+
341
+ - Altman (1999): *Constrained Markov Decision Processes*.
342
+ - Henderson, Islam, Bachman, Pineau, Precup, Meger ([arXiv:1709.06560](https://arxiv.org/abs/1709.06560), AAAI 2018): *Deep Reinforcement Learning that Matters* — foundational reproducibility study; motivated multi-bucket seed evaluation here.
343
+ - Colas, Sigaud, Oudeyer ([arXiv:1806.08295](https://arxiv.org/abs/1806.08295), 2018): *How Many Random Seeds? Statistical Power Analysis in Deep RL Experiments* — power-analysis basis for n=30.
344
+ - Agarwal, Schwarzer, Castro, Courville, Bellemare ([arXiv:2108.13264](https://arxiv.org/abs/2108.13264), NeurIPS 2021 Outstanding Paper): *Deep RL at the Edge of the Statistical Precipice* — IQM, bootstrap CIs, probability-of-improvement adopted in the statistical-evidence table.
REPRODUCIBILITY.md ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Budget Router Reproducibility Guide
2
+
3
+ This guide is a Pareto-principle (80/20) falsification checklist for Budget Router. Its goal is not to run every possible experiment; it is to quickly answer the questions most likely to invalidate the project claims:
4
+
5
+ - Does the environment still behave like the source describes?
6
+ - Does the grader still resist reward gaming and abstention exploits?
7
+ - Does the heuristic remain a real baseline rather than a degenerate trick?
8
+ - Does the LLM policy beat the heuristic for the right reasons, not just prompt or seed overfitting?
9
+ - Does PPO still demonstrate learnability beyond reactive heuristics on `hard_multi`?
10
+
11
+ Use the active `README.md` only as a claim surface and intuition source. The source of truth is the code: `budget_router/environment.py`, `budget_router/reward.py`, `budget_router/policies.py`, `budget_router/tasks.py`, `inference.py`, `eval/eval_all.py`, `eval/trace_episode.py`, `budget_router/validation.py`, and the tests under `budget_router/tests/`. Do not use archived README files for this analysis.
12
+
13
+ ## Mental Model
14
+
15
+ Budget Router is a partially observable routing environment. A policy chooses one of:
16
+
17
+ - `route_to_a`
18
+ - `route_to_b`
19
+ - `route_to_c`
20
+ - `shed_load`
21
+
22
+ The policy sees normalized public observations only: provider rolling success estimates, remaining budget, queue backlog, latency, and progress. It does not see true provider health. Provider status `0.5` means unprobed/unknown, not healthy.
23
+
24
+ The environment has two scoring layers:
25
+
26
+ - Step reward in `budget_router/reward.py::step_reward`: dense learning signal with success/failure, cost, SLA penalty, and a catastrophic budget-exhaustion path in `BudgetRouterEnv.step`.
27
+ - Episode grader in `budget_router/reward.py::grade_episode`: semantic benchmark score in `[0, 1]` using success, latency, budget, SLA, and adaptation.
28
+
29
+ This distinction matters. Reward hacking usually appears when a policy optimizes a shaped reward or loophole that does not match the semantic grader. The most important checks below are designed to catch that quickly.
30
+
31
+ ## The 20-30% Command Ladder
32
+
33
+ Run these in order when you want high confidence fast. Stop at the first failure and inspect before spending tokens or API calls on larger experiments.
34
+
35
+ ### 1. Install the Base Environment
36
+
37
+ ```bash
38
+ uv sync
39
+ ```
40
+
41
+ Why: this is the minimal dependency set for unit tests, heuristic policy checks, environment validation, and non-LLM traces. It does not require API keys or training dependencies.
42
+
43
+ Red flags:
44
+
45
+ - dependency resolution fails
46
+ - imports fail for `openenv_core`, `typer`, or local `budget_router`
47
+ - tests below require hidden setup not documented in code
48
+
49
+ ### 2. Run the Unit and Regression Tests
50
+
51
+ ```bash
52
+ uv run pytest budget_router/tests
53
+ ```
54
+
55
+ Why: this is the fastest broad guardrail. It covers deterministic resets, observation bounds, reward sanity, anti-abstention grader semantics, `hard_multi` adaptation windows, seed selection, LLM prompt structure, trace output shape, and GRPO reward behavior.
56
+
57
+ Highest-value test areas:
58
+
59
+ - `test_environment.py::TestGraderSemantics`: catches reward gaming by always shedding or partially abstaining.
60
+ - `test_environment.py::TestBehavioralGuards`: catches heuristic budget-exhaustion regressions on `hard_multi`.
61
+ - `test_eval_all_seed_selection.py`: catches seed-bucket drift and explicit fresh-seed parsing regressions.
62
+ - `test_inference_prompt.py`: catches LLM prompt regressions around budget runway, noise calibration, task name, and bankruptcy warnings.
63
+ - `test_grpo_training_reward.py`: catches GRPO reward mistakes where incomplete episodes get full grader credit.
64
+
65
+ Red flags:
66
+
67
+ - pure abstention scores too high
68
+ - partial abstention beats full service
69
+ - `hard_multi` adaptation ignores the secondary degradation window
70
+ - explicit seeds no longer override named seed sets
71
+ - LLM prompt loses `0.500 = unobserved`, budget runway, or bankruptcy constraints
72
+ - GRPO partial episodes get the full episode grader
73
+
74
+ ### 3. Run No-API Environment Validation
75
+
76
+ ```bash
77
+ uv run python -m budget_router.validation
78
+ ```
79
+
80
+ Why: this compares random, heuristic, oracle, and degenerate policies across tasks and seed sets without calling an LLM. It is the best single command for environment validity, reward-gaming resistance, and oracle-vs-baseline headroom.
81
+
82
+ What it checks from source:
83
+
84
+ - `random_policy`: lower-bound behavior.
85
+ - `heuristic_baseline_policy`: public-observation, cheapest-viable baseline.
86
+ - `debug_upper_bound_policy`: oracle/debug policy with privileged internal health access.
87
+ - degenerate policies: always A, always B, always C, always shed.
88
+ - hard assertions: baseline beats random on core tasks, oracle beats baseline, degenerate policies do not all dominate, heldout behavior is stable, rewards are not NaN, episodes do not exceed 20 steps.
89
+
90
+ How to interpret:
91
+
92
+ - Oracle above heuristic means the environment leaves real headroom for better policies.
93
+ - Heuristic above random means the benchmark is not noise.
94
+ - Degenerate policies failing to dominate means the grader is not trivially gameable.
95
+ - Heldout stability means basic environment behavior is not seed-fragile.
96
+
97
+ Red flags:
98
+
99
+ - oracle no longer beats heuristic on any meaningful task
100
+ - random beats heuristic broadly outside the intentionally hard `hard_multi` caveat
101
+ - always shed or always C dominates the heuristic
102
+ - validation passes only because assertions were weakened
103
+ - reward means shift sharply without a corresponding intentional source change in `tasks.py`, `environment.py`, or `reward.py`
104
+
105
+ ### 4. Inspect Exact-Seed Behavior With Traces
106
+
107
+ Use traces when aggregate numbers move or when you suspect reward hacking. Start with heuristic because it is deterministic and no-API.
108
+
109
+ **Progress while the episode runs:** By default, `eval/trace_episode.py` prints nothing until the episode completes (then it prints the full table and optional JSON). For **~20 sequential LLM calls**, that can look “stuck.” Pass `--verbose` or `-v` to print one `[trace]` line per environment step as it happens (`step`, `action`, step `reward`, cumulative reward, `done`). For `--policy llm`, you also get a `[trace] begin …` line before the first network call, and `llm_error=…` when a step falls back after an API error.
110
+
111
+ ```bash
112
+ uv run python eval/trace_episode.py \
113
+ --task hard_multi \
114
+ --seed 3 \
115
+ --policy heuristic \
116
+ --verbose \
117
+ --output-json outputs/trace_heuristic_hard_multi_seed3.json
118
+ ```
119
+
120
+ For a PPO trace, install the training extras, then either use the bundled `trained_models/ppo_hard_multi_100k.zip` or retrain from scratch first (retraining overwrites the checkpoint at the default save path used by the trace script):
121
+
122
+ ```bash
123
+ uv sync --extra training
124
+
125
+ # Recreate the checkpoint from scratch (optional if zip already present and trusted)
126
+ uv run python train/train_ppo_hard_multi.py
127
+
128
+ uv run python eval/trace_episode.py \
129
+ --task hard_multi \
130
+ --seed 3 \
131
+ --policy ppo \
132
+ --verbose \
133
+ --output-json outputs/trace_ppo_hard_multi_seed3.json
134
+ ```
135
+
136
+ If API credentials are configured:
137
+
138
+ ```bash
139
+ export API_BASE_URL="https://router.huggingface.co/v1"
140
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
141
+ export HF_TOKEN="<your-token>"
142
+
143
+ uv run python eval/trace_episode.py \
144
+ --task hard_multi \
145
+ --seed 3 \
146
+ --policy llm \
147
+ --verbose \
148
+ --output-json outputs/trace_llm_hard_multi_seed3.json
149
+ ```
150
+
151
+ Why: After the episode, `eval/trace_episode.py` prints the public observation before each action plus action, provider, success, reward, cumulative reward, cost, budget, latency, and final grader breakdown. With `--verbose`, you also see **per-step progress during** the run (recommended for LLM). This is the fastest way to see whether a policy is actually adapting or merely exploiting a scoring artifact.
152
+
153
+ Red flags:
154
+
155
+ - policy sheds many steps but grader remains high
156
+ - policy burns budget early and still scores well
157
+ - policy never probes unknown providers but appears to infer hidden health
158
+ - LLM repeatedly switches on one noisy failure despite the prompt's noise calibration
159
+ - PPO repeatedly chooses a degenerate sequence such as always C or always shed
160
+ - traces expose hidden provider health to the acting policy; the trace may display evidence after the fact, but policy inputs should remain public observations
161
+
162
+ ### 5. Reproduce Heuristic vs LLM Claims by Seed Bucket
163
+
164
+ Set credentials only for LLM runs:
165
+
166
+ ```bash
167
+ export API_BASE_URL="https://router.huggingface.co/v1"
168
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
169
+ export HF_TOKEN="<your-token>"
170
+ ```
171
+
172
+ Dev full-suite check:
173
+
174
+ ```bash
175
+ uv run python eval/eval_all.py \
176
+ --tasks easy --tasks medium --tasks hard --tasks hard_multi \
177
+ --policies heuristic --policies llm \
178
+ --seeds 10 \
179
+ --seed-set dev \
180
+ --out-dir outputs/repro_dev_alltasks
181
+ ```
182
+
183
+ Heldout `hard_multi` check:
184
+
185
+ ```bash
186
+ uv run python eval/eval_all.py \
187
+ --tasks hard_multi \
188
+ --policies heuristic --policies llm \
189
+ --seeds 10 \
190
+ --seed-set heldout \
191
+ --out-dir outputs/repro_heldout_hard_multi
192
+ ```
193
+
194
+ Fresh arbitrary-seed check:
195
+
196
+ ```bash
197
+ uv run python eval/eval_all.py \
198
+ --tasks hard_multi \
199
+ --policies heuristic --policies llm \
200
+ --seed-values "200,201,202,203,204,205,206,207,208,209" \
201
+ --out-dir outputs/repro_fresh_200_209_hard_multi
202
+ ```
203
+
204
+ Why: `eval/eval_all.py` writes timestamped JSON and Markdown summaries. Its seed logic has explicit named buckets for `dev` and `heldout`, plus `--seed-values` for arbitrary fresh seeds. Fresh seeds are the main defense against "tuned on heldout" critiques.
205
+
206
+ How to interpret:
207
+
208
+ - Dev is useful for smoke testing and for comparison with existing README claims.
209
+ - Heldout is the first real overfitting check.
210
+ - Fresh seeds are the strongest quick falsifier of prompt/guard overfitting.
211
+ - Compare paired seeds, not just aggregate means; LLM and heuristic should be evaluated on the same seeds.
212
+
213
+ Red flags:
214
+
215
+ - LLM only wins on dev and collapses on heldout/fresh
216
+ - LLM improvement comes mostly from one outlier seed
217
+ - LLM loses the `hard_multi` adaptation sub-score while gaining budget score via excessive shedding
218
+ - LLM invalid outputs are silently converted to `shed_load` too often
219
+ - API/model changes make results incomparable without recording `MODEL_NAME`, endpoint, date, and prompt mode
220
+
221
+ Optional raw LLM audit:
222
+
223
+ ```bash
224
+ LLM_LOG_RAW=1 LLM_LOG_RAW_MAX_CHARS=400 \
225
+ uv run python eval/eval_all.py \
226
+ --tasks hard_multi \
227
+ --policies heuristic --policies llm \
228
+ --seed-values "200,201,202" \
229
+ --out-dir outputs/repro_llm_raw_audit
230
+ ```
231
+
232
+ Why: this helps distinguish real policy behavior from parser/guard artifacts. The parser in `inference.py` extracts a valid action string when present and falls back to `shed_load` when parsing fails.
233
+
234
+ ### 6. Evaluate the Included PPO Hard_Multi Policy
235
+
236
+ ```bash
237
+ uv sync --extra training
238
+
239
+ uv run python train/eval_hard_multi.py
240
+ ```
241
+
242
+ Why: this is the source-backed PPO comparison path for `hard_multi`. It loads `trained_models/ppo_hard_multi_100k.zip`, evaluates deterministic PPO on seeds `0-9`, evaluates the heuristic on the same seeds, reports mean/std/95% CI/win rate/subscores, and writes `outputs/ppo_hard_multi_eval.json`.
243
+
244
+ Red flags:
245
+
246
+ - model file is missing
247
+ - PPO no longer beats heuristic on most paired seeds
248
+ - PPO wins only by budget preservation while success/adaptation collapse
249
+ - PPO traces reveal degenerate always-action behavior
250
+ - PPO results are compared against a different seed set than heuristic
251
+
252
+ Important limitation: `eval/eval_all.py` accepts `--policies ppo` but currently only warns that PPO is not wired there. Use `train/eval_hard_multi.py` or `eval/trace_episode.py --policy ppo` (optional `--verbose`) for PPO evidence.
253
+
254
+ ### 7. Retrain PPO Only When You Need to Revalidate Learnability
255
+
256
+ ```bash
257
+ uv sync --extra training
258
+
259
+ uv run python train/train_ppo_hard_multi.py
260
+
261
+ uv run python train/eval_hard_multi.py
262
+ ```
263
+
264
+ Why: training is expensive relative to the other checks. Run it when source changes touch `environment.py`, `reward.py`, `tasks.py`, `train/gym_wrapper.py`, or PPO hyperparameters. The current training script uses Stable-Baselines3 PPO, `MlpPolicy`, 4 parallel envs, 100k steps, and saves `trained_models/ppo_hard_multi_100k.zip`.
265
+
266
+ Red flags:
267
+
268
+ - PPO cannot improve after training
269
+ - training reward improves but grader does not
270
+ - policy learns to terminate early or exploit budget scoring
271
+ - learned behavior is strong on dev seeds but weak on exact fresh traces
272
+
273
+ ### 8. GRPO/Tool-Calling Smoke Checks
274
+
275
+ Use this only if you are touching GRPO/training-wrapper code:
276
+
277
+ ```bash
278
+ # Blocked for now until GRPO is fixed; the commands below stay commented out until then.
279
+ #uv sync --extra grpo
280
+
281
+ #PYTORCH_ENABLE_MPS_FALLBACK=1 uv run python train/smoke_test.py
282
+ ```
283
+
284
+ Why: this validates model-to-tool-to-environment-to-reward plumbing. It is not evidence of learning. The unit tests around `train/grpo_env.py` and `train/learn_experiment.py` are more important for reward correctness.
285
+
286
+ Red flags:
287
+
288
+ - model makes no tool calls and receives nonzero reward
289
+ - incomplete episodes receive full grader score
290
+ - tool wrapper constructs custom history instead of delegating to `BudgetRouterEnv.step`
291
+ - action-sequence diversity collapses before learning is expected
292
+
293
+ ## Policy Definitions
294
+
295
+ Heuristic policy:
296
+
297
+ - Defined in `budget_router/policies.py::heuristic_baseline_policy`.
298
+ - Uses only public `Observation`.
299
+ - Chooses the cheapest provider with status above `0.52` or unprobed `0.5`.
300
+ - Applies a simple low-budget guard that excludes expensive C below `0.10` budget fraction.
301
+ - This is a reactive baseline, not an oracle.
302
+
303
+ Oracle/debug upper-bound policy:
304
+
305
+ - Defined in `budget_router/policies.py::debug_upper_bound_policy`.
306
+ - Uses privileged `InternalState`, including true provider health and remaining budget.
307
+ - It is validation-only and should never be presented as a deployable policy.
308
+ - Its purpose is to prove there is headroom above the public-observation heuristic.
309
+
310
+ LLM policy:
311
+
312
+ - Defined in `inference.py::LLMRouter`.
313
+ - Uses an OpenAI-compatible chat API.
314
+ - Prompt requires exactly one action string.
315
+ - Adds trend text, budget runway, task name, and optional previous-step feedback.
316
+ - Applies `_apply_budget_safety_guard`, which vetoes actions that would immediately exhaust public remaining budget.
317
+ - Parser fallback is `shed_load`; frequent fallback is a red flag, not a win.
318
+
319
+ PPO policy:
320
+
321
+ - Training path: `train/train_ppo_hard_multi.py`.
322
+ - Evaluation path: `train/eval_hard_multi.py`.
323
+ - Trace path: `eval/trace_episode.py --policy ppo` (optional `--verbose` / `-v` for per-step lines during the run).
324
+ - Gym wrapper: `train/gym_wrapper.py`.
325
+ - Current headline PPO scope is `hard_multi`, not all tasks.
326
+
327
+ Degenerate policies:
328
+
329
+ - Defined in `budget_router/policies.py`.
330
+ - Always A, always B, always C, always shed.
331
+ - These are not competitors; they are exploit detectors.
332
+
333
+ ## What Counts as "Results Still Stand"
334
+
335
+ The README claims are still credible only if the following all hold:
336
+
337
+ 1. Unit tests pass, especially grader semantics and seed-selection tests.
338
+ 2. `budget_router/validation.py` still shows non-triviality, oracle headroom, degenerate-policy resistance, heldout stability, no NaNs, and no >20-step episodes.
339
+ 3. Exact traces show plausible adaptation rather than abstention, parser fallback, or hidden-state leakage.
340
+ 4. LLM vs heuristic remains positive on paired heldout and fresh `hard_multi` seeds, not just dev.
341
+ 5. PPO evaluation through `train/eval_hard_multi.py` still beats heuristic on paired dev seeds if PPO claims are retained.
342
+ 6. Any material drift is reflected in `README.md`; do not preserve old claims if the source-backed commands contradict them.
343
+
344
+ ## Fast Failure Triage
345
+
346
+ If unit tests fail:
347
+
348
+ - Inspect `reward.py` first for grader regressions.
349
+ - Inspect `environment.py` next for step history, budget exhaustion, termination, observation bounds, and degradation timing.
350
+ - Inspect `tasks.py` if task difficulty or seed outcomes moved unexpectedly.
351
+
352
+ If validation fails:
353
+
354
+ - Compare random, heuristic, oracle, and degenerate rows.
355
+ - If degenerate policies dominate, the grader or task economics are probably gameable.
356
+ - If oracle has no headroom, the task is too easy or the oracle/health dynamics changed.
357
+ - If heuristic is unstable across seed sets, check degradation jitter and stochastic success paths.
358
+
359
+ If LLM results fail:
360
+
361
+ - Confirm `MODEL_NAME`, endpoint, prompt mode, and credentials.
362
+ - Run a one-seed trace with `--policy llm` (add `--verbose` so each step logs while waiting on the API).
363
+ - Enable `LLM_LOG_RAW=1` for a small seed slice.
364
+ - Check whether failures are reasoning failures, parser failures, safety-guard interventions, or API/model drift.
365
+
366
+ If PPO results fail:
367
+
368
+ - Confirm `trained_models/ppo_hard_multi_100k.zip` exists.
369
+ - Run `eval/trace_episode.py --policy ppo` (optionally `--verbose`) on a winning and losing seed.
370
+ - Check whether `train/gym_wrapper.py` observation/action mapping still matches `BudgetRouterEnv`.
371
+ - Retrain only after source-level checks pass.
372
+
373
+ ## Minimum Evidence Bundle for a PR or Submission
374
+
375
+ For a fast but serious evidence package, save outputs from:
376
+
377
+ ```bash
378
+ uv run pytest budget_router/tests
379
+
380
+ uv run python -m budget_router.validation
381
+
382
+ uv run python eval/trace_episode.py \
383
+ --task hard_multi \
384
+ --seed 3 \
385
+ --policy heuristic \
386
+ --verbose \
387
+ --output-json outputs/evidence_trace_heuristic_hard_multi_seed3.json
388
+
389
+ uv run python eval/eval_all.py \
390
+ --tasks hard_multi \
391
+ --policies heuristic --policies llm \
392
+ --seeds 10 \
393
+ --seed-set heldout \
394
+ --out-dir outputs/evidence_heldout_hard_multi
395
+
396
+ uv run python eval/eval_all.py \
397
+ --tasks hard_multi \
398
+ --policies heuristic --policies llm \
399
+ --seed-values "200,201,202,203,204,205,206,207,208,209" \
400
+ --out-dir outputs/evidence_fresh_hard_multi
401
+
402
+ uv sync --extra training
403
+ uv run python train/eval_hard_multi.py
404
+ ```
405
+
406
+ This bundle covers correctness, anti-gaming, environment validity, exact behavior, heldout/fresh LLM comparison, and PPO learnability. It is small enough to run before a merge, but broad enough to catch most ways the published claims could become false.
__init__.py ADDED
File without changes
app_gradio.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Budget Router — Gradio Visualization Dashboard
3
+ Run: python app_gradio.py (launches on http://localhost:7860)
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import math
8
+ import time
9
+ from typing import Dict, Optional, Tuple
10
+
11
+ import gradio as gr
12
+ from budget_router.environment import BudgetRouterEnv
13
+ from budget_router.models import Action, ActionType
14
+ from budget_router.tasks import TASK_PRESETS
15
+
16
+ from gradio_ui.config import MAX_STEPS as _MAX_STEPS, POLICY_CHOICES, SCENARIOS
17
+ from gradio_ui.policies import get_policy_runner
18
+ from gradio_ui.renderers import (
19
+ _kpi_grid,
20
+ render_incident_timeline,
21
+ render_side_panel,
22
+ render_grader_plot,
23
+ MISSION_SCORE_HELP,
24
+ MISSION_SCORE_LABEL,
25
+ _GRADER_PENDING,
26
+ _PROVIDER_EMPTY,
27
+ render_history_table_compare,
28
+ )
29
+ from gradio_ui.state import fresh_side_state, _observation_to_dict, record_step
30
+ from gradio_ui.theme import LIGHT_CSS, THEME
31
+
32
+ MAX_STEPS = _MAX_STEPS
33
+
34
+
35
+ # Compatibility: preserve module-level MAX_STEPS for callers.
36
+
37
+ # ─── UI Build ─────────────────────────────────────────────────────────────────
38
+
39
+ def build_app() -> gr.Blocks:
40
+
41
+ def _normalize_seed(seed: object, default: int = 42) -> int:
42
+ if seed is None:
43
+ return default
44
+ try:
45
+ val = float(seed) # type: ignore[arg-type]
46
+ except Exception:
47
+ return default
48
+ if math.isnan(val) or math.isinf(val):
49
+ return default
50
+ try:
51
+ return int(val)
52
+ except Exception:
53
+ return default
54
+
55
+ with gr.Blocks(title="Budget Router — Policy Comparison", theme=THEME, css=LIGHT_CSS) as demo:
56
+
57
+ left_state = gr.State(fresh_side_state())
58
+ right_state = gr.State(fresh_side_state())
59
+ run_state = gr.State({"running": False, "scenario": "easy", "seed": 42, "step": 0})
60
+
61
+ gr.Markdown(
62
+ "# Budget Router — Policy Comparison\n"
63
+ "_Select 2 policies · start episode · step or finish episode · compare outcomes_"
64
+ )
65
+
66
+ with gr.Row():
67
+ with gr.Column(scale=1):
68
+ left_title = gr.Markdown("## Policy A")
69
+ left_policy = gr.Dropdown(choices=POLICY_CHOICES, value=None, label="Select policy")
70
+ left_status = gr.Textbox(label="Status", interactive=False, lines=2)
71
+ left_providers = gr.HTML(_PROVIDER_EMPTY())
72
+ left_budget = gr.HTML("")
73
+ left_kpis = gr.HTML(
74
+ _kpi_grid(
75
+ [
76
+ ("Step", "—"),
77
+ ("Last action", "—"),
78
+ ("Latency (ms)", "—"),
79
+ ("Budget remaining", "—"),
80
+ ("Reward", "—"),
81
+ ("Adaptation", "—"),
82
+ ]
83
+ )
84
+ )
85
+ left_badges = gr.HTML("")
86
+ left_summary = gr.HTML(
87
+ _kpi_grid(
88
+ [
89
+ ("Failed %", "—"),
90
+ ("SLA breach %", "—"),
91
+ ("Avg latency (ms)", "—"),
92
+ ]
93
+ )
94
+ )
95
+
96
+ with gr.Column(scale=1):
97
+ right_title = gr.Markdown("## Policy B")
98
+ right_policy = gr.Dropdown(choices=POLICY_CHOICES, value=None, label="Select policy")
99
+ right_status = gr.Textbox(label="Status", interactive=False, lines=2)
100
+ right_providers = gr.HTML(_PROVIDER_EMPTY())
101
+ right_budget = gr.HTML("")
102
+ right_kpis = gr.HTML(
103
+ _kpi_grid(
104
+ [
105
+ ("Step", "—"),
106
+ ("Last action", "—"),
107
+ ("Latency (ms)", "—"),
108
+ ("Budget remaining", "—"),
109
+ ("Reward", "—"),
110
+ ("Adaptation", "—"),
111
+ ]
112
+ )
113
+ )
114
+ right_badges = gr.HTML("")
115
+ right_summary = gr.HTML(
116
+ _kpi_grid(
117
+ [
118
+ ("Failed %", "—"),
119
+ ("SLA breach %", "—"),
120
+ ("Avg latency (ms)", "—"),
121
+ ]
122
+ )
123
+ )
124
+
125
+ with gr.Row():
126
+ with gr.Column(scale=2):
127
+ gr.Markdown("### Episode Controls")
128
+ scenario_sel = gr.Radio(SCENARIOS, value="easy", label="Scenario")
129
+ seed_inp = gr.Number(value=42, label="Seed", precision=0)
130
+ start_btn = gr.Button("▶ Start Episode", variant="primary", interactive=False)
131
+ with gr.Row():
132
+ step_btn = gr.Button("→ Step", variant="secondary", interactive=False)
133
+ fast_btn = gr.Button("⚡ Fast-forward", interactive=False)
134
+ finish_btn = gr.Button("⏩ Finish Episode", interactive=False)
135
+
136
+ gr.Markdown(f"### {MISSION_SCORE_LABEL} (comparison)\n_{MISSION_SCORE_HELP}_")
137
+ grader_plot = gr.Plot()
138
+
139
+ with gr.Row(elem_classes=["episode-history-row"]):
140
+ with gr.Column(scale=1):
141
+ left_history_title = gr.Markdown("### Step History — Policy A")
142
+ left_history_tbl = gr.HTML(render_history_table_compare([]), elem_classes=["episode-history-table"])
143
+ with gr.Column(scale=1):
144
+ right_history_title = gr.Markdown("### Step History — Policy B")
145
+ right_history_tbl = gr.HTML(render_history_table_compare([]), elem_classes=["episode-history-table"])
146
+
147
+ with gr.Row():
148
+ with gr.Column(scale=1):
149
+ left_grade_title = gr.Markdown(f"### {MISSION_SCORE_LABEL} — Policy A")
150
+ left_grade = gr.HTML(_GRADER_PENDING())
151
+ with gr.Column(scale=1):
152
+ right_grade_title = gr.Markdown(f"### {MISSION_SCORE_LABEL} — Policy B")
153
+ right_grade = gr.HTML(_GRADER_PENDING())
154
+
155
+ gr.Markdown("### Incident Timeline")
156
+ incidents_html = gr.HTML(render_incident_timeline("easy"))
157
+
158
+ def _render_side(side: Dict, run: Dict, scenario_name: str) -> Tuple[str, str, str, str, str, str, str, str]:
159
+ return render_side_panel(side, run, scenario_name)
160
+
161
+ def _render_all(ls: Dict, rs: Dict, run: Dict) -> tuple:
162
+ scenario_name = str(run.get("scenario", "easy") or "easy")
163
+ l_out = _render_side(ls, run, scenario_name)
164
+ r_out = _render_side(rs, run, scenario_name)
165
+ plot = render_grader_plot(
166
+ ls.get("history", []) or [],
167
+ rs.get("history", []) or [],
168
+ left_name=str(ls.get("policy_name") or ""),
169
+ right_name=str(rs.get("policy_name") or ""),
170
+ )
171
+ incidents = render_incident_timeline(scenario_name)
172
+
173
+ running = bool(run.get("running", False))
174
+ btn_update = gr.update(interactive=running)
175
+ config_update = gr.update(interactive=(not running))
176
+ return (
177
+ ls,
178
+ rs,
179
+ run,
180
+ l_out[0],
181
+ l_out[1],
182
+ l_out[2],
183
+ l_out[3],
184
+ l_out[4],
185
+ l_out[5],
186
+ r_out[0],
187
+ r_out[1],
188
+ r_out[2],
189
+ r_out[3],
190
+ r_out[4],
191
+ r_out[5],
192
+ l_out[6],
193
+ r_out[6],
194
+ l_out[7],
195
+ r_out[7],
196
+ plot,
197
+ incidents,
198
+ config_update,
199
+ config_update,
200
+ config_update,
201
+ config_update,
202
+ config_update,
203
+ btn_update,
204
+ btn_update,
205
+ btn_update,
206
+ )
207
+
208
+ OUTPUTS = [
209
+ left_state,
210
+ right_state,
211
+ run_state,
212
+ left_status,
213
+ left_providers,
214
+ left_budget,
215
+ left_kpis,
216
+ left_badges,
217
+ left_summary,
218
+ right_status,
219
+ right_providers,
220
+ right_budget,
221
+ right_kpis,
222
+ right_badges,
223
+ right_summary,
224
+ left_history_tbl,
225
+ right_history_tbl,
226
+ left_grade,
227
+ right_grade,
228
+ grader_plot,
229
+ incidents_html,
230
+ left_policy,
231
+ right_policy,
232
+ scenario_sel,
233
+ seed_inp,
234
+ start_btn,
235
+ step_btn,
236
+ fast_btn,
237
+ finish_btn,
238
+ ]
239
+
240
+ GRADER_PLOT_IDX = OUTPUTS.index(grader_plot)
241
+
242
+ def _update_start_enabled(p1: Optional[str], p2: Optional[str], run: Dict):
243
+ left_name = str(p1 or "Policy A")
244
+ right_name = str(p2 or "Policy B")
245
+ running = bool((run or {}).get("running", False))
246
+ ok = (bool(p1) and bool(p2)) and (not running)
247
+ return (
248
+ gr.update(interactive=ok),
249
+ f"## {left_name}",
250
+ f"## {right_name}",
251
+ f"### Step History — {left_name}",
252
+ f"### Step History — {right_name}",
253
+ f"### {MISSION_SCORE_LABEL} — {left_name}",
254
+ f"### {MISSION_SCORE_LABEL} — {right_name}",
255
+ )
256
+
257
+ left_policy.change(
258
+ _update_start_enabled,
259
+ inputs=[left_policy, right_policy, run_state],
260
+ outputs=[start_btn, left_title, right_title, left_history_title, right_history_title, left_grade_title, right_grade_title],
261
+ )
262
+ right_policy.change(
263
+ _update_start_enabled,
264
+ inputs=[left_policy, right_policy, run_state],
265
+ outputs=[start_btn, left_title, right_title, left_history_title, right_history_title, left_grade_title, right_grade_title],
266
+ )
267
+
268
+ scenario_sel.change(lambda s: render_incident_timeline(s), inputs=[scenario_sel], outputs=[incidents_html])
269
+
270
+ def do_start(p1: str, p2: str, scenario: str, seed: Optional[float], _ls: Dict, _rs: Dict, _run: Dict):
271
+ ls = fresh_side_state()
272
+ rs = fresh_side_state()
273
+
274
+ seed_int = _normalize_seed(seed, default=42)
275
+
276
+ if not p1 or not p2:
277
+ run = {"running": False, "scenario": scenario, "seed": seed_int, "step": 0}
278
+ ls["status"] = "Select both policies to start."
279
+ rs["status"] = "Select both policies to start."
280
+ return _render_all(ls, rs, run)
281
+
282
+ runner_l, err_l = get_policy_runner(p1)
283
+ runner_r, err_r = get_policy_runner(p2)
284
+ if err_l or err_r or runner_l is None or runner_r is None:
285
+ ls["status"] = f"❌ {err_l}" if err_l else ""
286
+ rs["status"] = f"❌ {err_r}" if err_r else ""
287
+ run = {"running": False, "scenario": scenario, "seed": seed_int, "step": 0}
288
+ return _render_all(ls, rs, run)
289
+
290
+ env_l = BudgetRouterEnv()
291
+ env_r = BudgetRouterEnv()
292
+ obs_l = env_l.reset(seed=seed_int, scenario=scenario)
293
+ obs_r = env_r.reset(seed=seed_int, scenario=scenario)
294
+ try:
295
+ runner_l.reset(scenario)
296
+ except Exception:
297
+ pass
298
+ try:
299
+ runner_r.reset(scenario)
300
+ except Exception:
301
+ pass
302
+
303
+ ls.update(
304
+ {
305
+ "env": env_l,
306
+ "policy_name": p1,
307
+ "policy_runner": runner_l,
308
+ "obs": _observation_to_dict(obs_l),
309
+ "status": f"✅ Running · {p1}",
310
+ }
311
+ )
312
+ rs.update(
313
+ {
314
+ "env": env_r,
315
+ "policy_name": p2,
316
+ "policy_runner": runner_r,
317
+ "obs": _observation_to_dict(obs_r),
318
+ "status": f"✅ Running · {p2}",
319
+ }
320
+ )
321
+ run = {"running": True, "scenario": scenario, "seed": seed_int, "step": 0}
322
+ return _render_all(ls, rs, run)
323
+
324
def _apply_local_step(side: Dict, scenario_name: str, global_step: int) -> Dict:
    """Advance one side of the comparison by a single environment step.

    Args:
        side: Mutable per-policy state dict. Reads ``done``, ``env``,
            ``policy_runner``, ``obs``, ``status`` and ``cumulative_reward``;
            appends to ``history``.
        scenario_name: Unused here; kept so the call signature stays stable.
        global_step: Step number recorded in the history entry.

    Returns:
        The same ``side`` dict, updated in place.
    """
    if side.get("done"):
        return side

    env = side.get("env")
    runner = side.get("policy_runner")
    if env is None or runner is None:
        side["done"] = True
        side["status"] = "❌ Not initialized"
        return side

    # Snapshot the observation the policy acts on before the env mutates it.
    pre_obs = dict(side.get("obs", {}) or {})
    try:
        action_str = runner.choose_action(side.get("obs", {}) or {})
        # Build the action inside the guard: a policy that returns a string
        # ActionType does not recognise is a policy error, not a reason to
        # abort the whole UI callback.
        action = Action(action_type=ActionType(action_str))
    except Exception as exc:
        side["done"] = True
        side["status"] = f"❌ Policy error: {exc}"
        return side

    obs = _observation_to_dict(env.step(action))
    reward = float(obs.get("reward", 0.0) or 0.0)
    meta = dict(obs.get("metadata", {}) or {})
    done = bool(obs.get("done", False))

    side["history"].append(
        record_step(global_step, action_str, obs, reward, meta, health_obs=pre_obs)
    )
    side["obs"] = obs
    side["cumulative_reward"] = float(side.get("cumulative_reward", 0.0) or 0.0) + reward
    side["done"] = done
    # Keep the existing status text while the episode is still running.
    side["status"] = "✅ Done" if done else str(side.get("status", ""))
    return side
352
+
353
+ def do_step(ls: Dict, rs: Dict, run: Dict):
354
+ if not bool(run.get("running", False)):
355
+ return _render_all(ls, rs, run)
356
+ if int(run.get("step", 0) or 0) >= MAX_STEPS:
357
+ run["running"] = False
358
+ return _render_all(ls, rs, run)
359
+
360
+ next_step = int(run.get("step", 0) or 0) + 1
361
+ scenario = str(run.get("scenario", "easy") or "easy")
362
+
363
+ ls = _apply_local_step(ls, scenario, next_step)
364
+ rs = _apply_local_step(rs, scenario, next_step)
365
+ run["step"] = next_step
366
+
367
+ if next_step >= MAX_STEPS or (ls.get("done") and rs.get("done")):
368
+ run["running"] = False
369
+ return _render_all(ls, rs, run)
370
+
371
+ def _stream_to_end(ls: Dict, rs: Dict, run: Dict):
372
+ if not bool(run.get("running", False)):
373
+ yield _render_all(ls, rs, run)
374
+ return
375
+
376
+ frozen = _render_all(ls, rs, run)
377
+ frozen_grader_plot = frozen[GRADER_PLOT_IDX]
378
+
379
+ while bool(run.get("running", False)) and int(run.get("step", 0) or 0) < MAX_STEPS:
380
+ out = do_step(ls, rs, run)
381
+ ls, rs, run = out[0], out[1], out[2]
382
+ out_list = list(out)
383
+ out_list[GRADER_PLOT_IDX] = frozen_grader_plot
384
+ yield tuple(out_list)
385
+ time.sleep(0.12)
386
+ if not bool(run.get("running", False)):
387
+ break
388
+
389
+ yield _render_all(ls, rs, run)
390
+
391
+ def do_fast_forward(ls: Dict, rs: Dict, run: Dict):
392
+ yield from _stream_to_end(ls, rs, run)
393
+
394
+ def do_finish(ls: Dict, rs: Dict, run: Dict):
395
+ yield from _stream_to_end(ls, rs, run)
396
+
397
+ start_btn.click(do_start, inputs=[left_policy, right_policy, scenario_sel, seed_inp, left_state, right_state, run_state], outputs=OUTPUTS)
398
+ step_btn.click(do_step, inputs=[left_state, right_state, run_state], outputs=OUTPUTS)
399
+ fast_btn.click(do_fast_forward, inputs=[left_state, right_state, run_state], outputs=OUTPUTS)
400
+ finish_btn.click(do_finish, inputs=[left_state, right_state, run_state], outputs=OUTPUTS)
401
+
402
+ return demo
403
+
404
+
405
+ if __name__ == "__main__":
406
+ app = build_app()
407
+ app.queue()
408
+ app.launch(server_port=7860)
blog.md ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Budget Router: Teaching Agents to Survive Cascading API Failures Under Budget
2
+
3
+ Production AI systems do not fail politely.
4
+
5
+ An application may depend on several LLM or API providers, each with different cost, latency, and reliability profiles. One provider becomes flaky. Traffic shifts. The next fallback becomes overloaded or starts degrading. The system still has a budget, users still expect latency, and the router never sees the true internal health of the providers. It only sees noisy public signals: recent success rates, backlog, latency, and remaining budget.
6
+
7
+ That is the problem Budget Router is built to study.
8
+
9
+ Budget Router is an OpenEnv-compliant reinforcement learning environment where an agent routes each request to Provider A, B, C, or sheds load. A is cheap, B is moderate, C is reliable but expensive. The agent's job is not simply to pick the best provider now. It must preserve enough budget to survive what happens later.
10
+
11
+ The interesting case is `Hard_Multi`: Provider A degrades from the beginning, and Provider B cascades later in the episode. This creates a two-phase incident. A naive router can look reasonable early and still fail late because it spent too much budget before the real cascade arrived.
12
+
13
+ This is a small environment, but it captures a real infrastructure question:
14
+
15
+ > Can an agent learn budget-aware reliability behavior under partial observability and non-stationary provider degradation?
16
+
17
+ ## TL;DR
18
+
19
+ Budget Router is not a claim that a 20-step toy simulation is production routing. It is a compact, reproducible benchmark for a production-shaped failure mode: budgeted API routing under cascading degradation.
20
+
21
+ On the headline `Hard_Multi` task, we compare three policy families:
22
+
23
+ | Policy | What it is | Hard_Multi grader | Main takeaway |
24
+ |---|---|---:|---|
25
+ | Heuristic | Hand-coded reactive baseline | ~0.61 | A real baseline, but brittle under cascade failure |
26
+ | Zero-shot LLM | Qwen2.5-72B with a deterministic budget guard | ~0.65 | In-context reasoning helps when observations are semantically meaningful |
27
+ | PPO | Small SB3 MLP trained on the environment | ~0.69 | The reward signal is learnable and stronger than hand rules |
28
+
29
+ ```mermaid
30
+ flowchart LR
31
+ H["Heuristic baseline<br/>0.61<br/>hand-coded rules"] --> L["Zero-shot LLM<br/>0.65<br/>Qwen2.5-72B + budget guard"]
32
+ L --> P["Trained PPO<br/>0.69<br/>SB3 MLP, 100k steps"]
33
+ ```
34
+
35
+ We also ran post-training experiments beyond PPO:
36
+
37
+ - SFT on Qwen2.5-1.5B via Hugging Face Jobs completed end-to-end, but did **not** beat the heuristic on the latest 10-seed evaluation: `0.577` vs `0.596`, with 3/10 wins.
38
+ - GRPO was attempted, but did not converge reliably in our setup.
39
+ - The negative result is useful: this environment rewards sequential credit assignment, probing, recovery, and budget conservation. Plain behavioral cloning can imitate action patterns without learning why those actions matter.
40
+
41
+ ![Budget Router evidence](figures/budget_router_evidence.png)
42
+
43
+ *Figure: README evidence summary. The strongest claims are the three-policy ordering on `Hard_Multi`, heldout/fresh seed generalization for the LLM, and adaptation-score gains over the reactive heuristic.*
44
+
45
+ ## The Environment
46
+
47
+ Budget Router exposes a simple action space:
48
+
49
+ - `route_to_a`
50
+ - `route_to_b`
51
+ - `route_to_c`
52
+ - `shed_load`
53
+
54
+ The observation is intentionally public and partial. The policy sees:
55
+
56
+ - rolling provider success estimates,
57
+ - remaining budget,
58
+ - queue backlog,
59
+ - system latency,
60
+ - episode progress.
61
+
62
+ It does **not** see the true hidden provider health. This makes the problem a partially observable decision problem rather than a lookup table. The agent has to infer whether a provider is actually degrading or whether it just saw noise.
63
+
64
+ The task suite escalates difficulty:
65
+
66
+ | Task | Degradation pattern | Why it matters |
67
+ |---|---|---|
68
+ | `Easy` | No degradation | Budget-conservative rules are hard to beat |
69
+ | `Medium` | A degrades after step 5 | Reactive switching begins to matter |
70
+ | `Hard` | A degrades from step 0 | Early adaptation matters |
71
+ | `Hard_Multi` | A degrades from step 0, B from step 10 | Cascade failure forces budget-aware anticipation |
72
+
73
+ `Hard_Multi` is the core benchmark. If the router burns money on expensive fallbacks too early, it may have no budget left when B starts failing. If it stays cheap for too long, it loses success and SLA. If it sheds load too often, it avoids cost but fails the user.
74
+
75
+ That is the point: there is no single dominant action.
76
+
77
+ ## The Grader
78
+
79
+ The episode grader is a weighted score in `[0, 1]`:
80
+
81
+ ```text
82
+ overall = 0.30 * success
83
+ + 0.20 * latency
84
+ + 0.15 * budget
85
+ + 0.15 * SLA
86
+ + 0.20 * adaptation
87
+ ```
88
+
89
+ The grader is designed so that obvious reward hacks are unattractive:
90
+
91
+ | Shortcut | Why it fails |
92
+ |---|---|
93
+ | Always route to C | Good latency, but expensive and budget-risky |
94
+ | Always shed load | Avoids cost, but earns no success or adaptation |
95
+ | Always use A | Cheap, but collapses once A degrades |
96
+ | Switch only after failure | Too late in `Hard_Multi`, because budget and latency errors compound |
97
+
98
+ This is best understood as a soft-constraint MDP. Budget and SLA pressure are real and measured, but they are encoded through reward terms rather than enforced through a full constrained-MDP Lagrangian. That distinction matters. The environment is honest about tradeoffs instead of pretending the constraint design is solved.
99
+
100
+ ## What Worked
101
+
102
+ ### 1. The heuristic is a real baseline, not a strawman
103
+
104
+ The heuristic uses public observations and chooses the cheapest viable provider. It is budget-aware and reactive. On easy settings, this is exactly the kind of policy that should be strong.
105
+
106
+ That matters for the judges' trust in the benchmark. If a learned policy only beats random or a broken baseline, the environment is not very informative. Budget Router's baseline is good enough to make improvement nontrivial, but limited enough that cascade failure exposes its weakness.
107
+
108
+ On `Hard_Multi`, the heuristic reaches roughly `0.61`. It is not useless; it is just too reactive for a delayed cascade.
109
+
110
+ ### 2. Zero-shot LLM routing improves because the state is semantically meaningful
111
+
112
+ The LLM policy is not trained on Budget Router. It receives structured observations with meaningful field names:
113
+
114
+ ```text
115
+ provider_a_status: 0.42
116
+ budget_remaining: 0.31
117
+ queue_backlog: 0.20
118
+ system_latency: 0.55
119
+ step_count: 0.60
120
+ ```
121
+
122
+ That matters. A language model can reason about "budget remaining," "provider status," and "latency" without gradient updates. The prompt also includes practical routing guidance: do not treat an unprobed `0.500` status as confirmed health, pay attention to trends, and avoid bankruptcy.
123
+
124
+ The production-facing LLM policy includes a deterministic budget-safety guard. This is not hidden. It is a deliberate agentic-system pattern: use the model for high-level routing judgment, and use deterministic code for arithmetic-critical safety. Without this guard, raw LLM behavior can sometimes spend itself into the budget cliff.
125
+
126
+ On the README's combined `Hard_Multi` evaluation, the LLM improves over the heuristic across dev, heldout, and fresh seed buckets. The important claim is not that the LLM is magical. The claim is that semantically self-describing environments let a foundation model bring useful priors to a new control problem.
127
+
128
+ ### 3. PPO proves the environment is learnable
129
+
130
+ PPO is a small neural policy trained directly on environment interaction. It is not an LLM, and it is not the post-training story. Its role is scientific: if a small policy gradient method can improve over the heuristic, the reward signal has enough structure to optimize.
131
+
132
+ The PPO policy uses the same environment mechanics through a Gym wrapper. The wrapper converts OpenEnv-style typed observations into arrays for Stable-Baselines3, but PPO still routes through the same `BudgetRouterEnv.step()` dynamics and grader.
133
+
134
+ On `Hard_Multi`, PPO reaches roughly `0.69` and beats the heuristic across the reported seeds. The adaptation sub-score is the clearest mechanism: PPO learns to preserve budget early and route more effectively when the cascade arrives.
135
+
136
+ The honest limitation is that PPO sees `step_count`. In a fixed 20-step task, it may learn a schedule keyed partly to the clock: switch away from A early, prepare for B around step 10. That is still useful environment-validation evidence, but it is not the same as proving open-ended reactive reasoning. The LLM result is the stronger evidence for in-context reactive use of semantic observations.
137
+
138
+ ## What Did Not Work
139
+
140
+ The post-training experiments are just as important as the wins.
141
+
142
+ ### SFT: the pipeline worked, the policy did not improve enough
143
+
144
+ We built a full supervised fine-tuning pipeline:
145
+
146
+ 1. Generate trajectories from a stronger teacher policy.
147
+ 2. Convert observations and actions into chat-style training examples.
148
+ 3. Push the dataset to Hugging Face.
149
+ 4. Train a LoRA adapter on `Qwen/Qwen2.5-1.5B-Instruct` using Hugging Face Jobs.
150
+ 5. Merge and push the model.
151
+ 6. Evaluate against the heuristic baseline.
152
+
153
+ The operational pipeline worked. The HF Jobs flow trained and evaluated the model on GPU infrastructure. This matters for reproducibility: the fine-tuning path is not a sketch; it is runnable through `generate_sft_data.py`, `train_sft.py`, `eval_sft.py`, and `scripts/submit_sft_hf_jobs.sh`.
154
+
155
+ But the latest SFT evaluation did not beat the heuristic. On 10 `Hard_Multi` seeds, SFT scored `0.577` vs heuristic `0.596`, winning 3/10 seeds.
156
+
157
+ That is not a result to hide. It is the most useful negative result in the project.
158
+
159
+ The likely reason is that behavioral cloning sees only good-looking actions, not the counterfactuals. It can learn "route to B often" or "avoid C when budget is low," but it does not directly learn why a near-miss action is bad, how budget errors compound, or when probing is worth the short-term risk.
160
+
161
+ In Budget Router, the objective is episodic. One bad switch can erase a good early trajectory. A static label does not carry the full consequence of that decision.
162
+
163
+ ### GRPO: promising direction, not a successful result yet
164
+
165
+ We also attempted GRPO-style reward optimization for an LLM policy. That is the more natural post-training direction for an OpenEnv agent, because the model can interact with the environment and receive reward from actual consequences.
166
+
167
+ In our current run, GRPO did not produce a reliable improvement. Our pitch notes record a downward reward trend, weak rollout quality, and mode collapse in the attempted setup. The practical lesson is that GRPO needs more than a valid environment wrapper. It needs enough reward variance, enough model capacity, stable rollouts, and careful exploration.
168
+
169
+ So the honest conclusion is:
170
+
171
+ > PPO shows the environment is learnable. Zero-shot LLM shows semantic observations are useful. SFT shows imitation alone is not enough. GRPO remains the right research direction, but not a claimed win in this submission.
172
+
173
+ ## Why This Is Still a Strong Result
174
+
175
+ The strongest version of Budget Router is not "we found one trick that wins." It is this:
176
+
177
+ ```mermaid
178
+ flowchart TD
179
+ E["OpenEnv environment<br/>partial observability + cascade failure"] --> G["Five-part grader<br/>success, latency, budget, SLA, adaptation"]
180
+ G --> B["Heuristic baseline<br/>cheap reactive policy"]
181
+ G --> L["Zero-shot LLM<br/>semantic reasoning + budget guard"]
182
+ G --> P["PPO<br/>reward-aware optimization"]
183
+ P --> S["SFT/GRPO attempts<br/>negative results and future direction"]
184
+ ```
185
+
186
+ Budget Router has the properties a useful post-training environment should have:
187
+
188
+ | Property | Evidence |
189
+ |---|---|
190
+ | Non-trivial | Heuristic beats random but leaves headroom; oracle gap is largest on `Hard_Multi` |
191
+ | Learnable | PPO improves over heuristic on the hardest task |
192
+ | Semantically agentic | Zero-shot LLM improves because observations are meaningful |
193
+ | Not trivially gameable | Always-shed and always-expensive policies are penalized |
194
+ | Reproducible | README and `REPRODUCIBILITY.md` describe seed buckets, traces, saved JSON, and command paths |
195
+ | Honest | SFT and GRPO attempts are reported without overstating them |
196
+
197
+ That combination is rare in hackathon environments. Many environments are easy to demo but hard to falsify. Budget Router is designed to be falsified: run the seeds, inspect the traces, compare sub-scores, and check whether improvement comes from adaptation rather than a loophole.
198
+
199
+ ## Reproducibility
200
+
201
+ The repo is structured so judges can inspect both aggregate results and exact behavior.
202
+
203
+ Key artifacts:
204
+
205
+ - `README.md`: headline benchmark tables and evidence figure.
206
+ - `REPRODUCIBILITY.md`: command checklist and falsification guide.
207
+ - `eval/eval_all.py`: heuristic vs LLM evaluation across task and seed buckets.
208
+ - `eval/trace_episode.py`: step-by-step episode traces.
209
+ - `train/eval_hard_multi.py`: PPO evaluation path.
210
+ - `generate_sft_data.py`: SFT dataset generation from teacher trajectories.
211
+ - `train_sft.py`: LoRA SFT training script for Hugging Face Jobs.
212
+ - `eval_sft.py`: SFT model evaluation against the heuristic.
213
+ - `scripts/submit_sft_hf_jobs.sh`: orchestration for data, training, and evaluation jobs.
214
+
215
+ For the SFT pipeline, the intended run looks like:
216
+
217
+ ```bash
218
+ export TEACHER_POLICY=ppo
219
+ export HF_JOB_FLAVOR=a10g-large
220
+ export HF_JOB_NAMESPACE=akshay4
221
+ export DATASET_REPO=akshay4/budget-router-sft-data
222
+ export OUTPUT_REPO=akshay4/budget-router-sft-qwen1.5b
223
+ export SFT_MODEL_REPO=$OUTPUT_REPO
224
+ export SFT_N_EPISODES=100
225
+ export SFT_TOP_FRACTION=0.30
226
+ export NUM_EPOCHS=3
227
+ export N_SEEDS=10
228
+
229
+ ./scripts/submit_sft_hf_jobs.sh
230
+ ```
231
+
232
+ The important point is not that this SFT model won. It did not. The important point is that the environment can produce training data, launch model training, push artifacts, and evaluate the resulting policy. That closes the environment-to-training-to-evaluation loop, even when the experimental result is negative.
233
+
234
+ ## The Research Lesson
235
+
236
+ Budget Router is a reminder that post-training methods should match the task.
237
+
238
+ For static classification, supervised fine-tuning may be enough. For sequential decision-making under budget constraints, static imitation is often too weak. The agent needs to learn from consequences: what happens after a risky fallback, what happens when it fails to probe, what happens when it saves budget early, and what happens when it arrives at the cascade with no runway left.
239
+
240
+ That is why PPO worked better than SFT here. PPO receives feedback from the environment. It optimizes the episode objective directly. The zero-shot LLM also performs well because it brings external priors about risk, cost, and reliability to a semantically described state.
241
+
242
+ The next research step is not to pretend SFT solved the problem. It is to use SFT as a warm start or distillation layer, then apply environment-aware RL with better rollout diversity and reward normalization.
243
+
244
+ ## Conclusion
245
+
246
+ Budget Router is an incident-commander environment for budgeted API reliability. It asks a simple question with real consequences:
247
+
248
+ > When providers degrade and budget is running out, can an agent adapt before the cascade breaks the system?
249
+
250
+ The answer from our experiments is nuanced:
251
+
252
+ - hand-coded rules are strong but brittle,
253
+ - zero-shot LLM reasoning helps when the observation schema is meaningful,
254
+ - PPO confirms the environment has a learnable reward signal,
255
+ - SFT and GRPO are not claimed wins, but they reveal where the hard part actually is.
256
+
257
+ That is the story we think is worth submitting: a reproducible environment, a real baseline, measurable improvement, and enough intellectual honesty that the failures make the benchmark more credible rather than less.
budget_router/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Budget Router Environment - package init."""
2
+
3
+ from .environment import BudgetRouterEnv
4
+ from .models import Action, ActionType, EnvState, Observation, TaskConfig
5
+ from .tasks import EASY, HARD, MEDIUM
6
+
7
+ __all__ = [
8
+ "BudgetRouterEnv",
9
+ "Action",
10
+ "ActionType",
11
+ "Observation",
12
+ "EnvState",
13
+ "TaskConfig",
14
+ "EASY",
15
+ "MEDIUM",
16
+ "HARD",
17
+ ]
budget_router/client.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict
2
+ from typing import Any, Dict
3
+
4
+ from openenv_core import HTTPEnvClient
5
+ from openenv_core.client_types import StepResult
6
+
7
+ from .models import Action, EnvState, Observation
8
+
9
+
10
class BudgetRouterClient(HTTPEnvClient[Action, Observation]):
    """Client-side adapter between Budget Router dataclasses and wire payloads.

    Converts an ``Action`` into a plain dict for the step request and rebuilds
    ``Observation`` / ``EnvState`` objects from response dicts.
    """

    def _step_payload(self, action: Action) -> Dict[str, Any]:
        """Return *action* as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(action)

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[Observation]:
        """Build a ``StepResult`` from a step/reset response dict.

        The observation fields are read from ``payload["observation"]`` when
        present, otherwise from ``payload`` itself. ``done`` and ``reward``
        prefer the top-level value and fall back to the observation dict;
        ``metadata`` prefers the observation dict and falls back to the top
        level.
        """
        observation_payload = payload.get("observation", payload)

        # ``done`` / ``reward`` / ``metadata`` are passed explicitly below.
        # Leaving them in the splatted dict would make the constructor call
        # fail with "got multiple values for keyword argument", so drop them
        # from the field dict first.
        envelope_keys = ("done", "reward", "metadata")
        observation_fields = {
            key: value
            for key, value in observation_payload.items()
            if key not in envelope_keys
        }

        observation = Observation(
            **observation_fields,
            done=payload.get("done", observation_payload.get("done", False)),
            reward=payload.get("reward", observation_payload.get("reward")),
            metadata=observation_payload.get("metadata", payload.get("metadata", {})),
        )
        return StepResult(
            observation=observation,
            reward=observation.reward,
            done=observation.done,
        )

    def _parse_state(self, payload: Dict[str, Any]) -> EnvState:
        """Rebuild an ``EnvState`` from its dict form (keys become kwargs)."""
        return EnvState(**payload)
budget_router/environment.py ADDED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Budget Router Environment — Core RL environment.
3
+
4
+ Extends openenv-core Environment base class with the standard
5
+ reset(), step(), state interface. Processes one request per step
6
+ through 3 providers under budget, latency, reliability, and
7
+ degradation constraints.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import math
14
+ import random
15
+ import uuid
16
+ from typing import Any, Dict, Optional, Tuple
17
+
18
+ from openenv_core.env_server import Environment
19
+ from openenv_core.env_server.types import Action as OpenEnvAction
20
+
21
+ from .models import (
22
+ Action,
23
+ ActionType,
24
+ EnvState,
25
+ InternalState,
26
+ Observation,
27
+ ProviderState,
28
+ TaskConfig,
29
+ )
30
+ from .reward import grade_episode, step_reward
31
+ from .tasks import EASY
32
+
33
+ BACKLOG_LATENCY_PER_ITEM_MS = 8.0
34
+
35
+
36
def _reported_score(value: float) -> float:
    """Return *value* as a float clamped into the closed range [0.001, 0.999]."""
    floor, ceiling = 0.001, 0.999
    score = float(value)
    if score < floor:
        return floor
    if score > ceiling:
        return ceiling
    return score
38
+
39
+
40
+ class BudgetRouterEnv(Environment):
41
+ """
42
+ Incident Commander for Budgeted Tool/API Reliability.
43
+
44
+ An agent routes incoming requests to one of 3 providers (A, B, C)
45
+ or sheds load, under budget, latency, and reliability constraints.
46
+
47
+ Extends OpenEnv Environment base class with proper type parameters.
48
+
49
+ Interface:
50
+ reset(seed, scenario) -> Observation
51
+ step(action) -> Observation (reward in obs.reward, done in obs.done)
52
+ state -> EnvState
53
+ """
54
+
55
def __init__(self, emit_structured_logs: bool = False) -> None:
    """Create an environment in its pre-reset state.

    Args:
        emit_structured_logs: When true, ``_emit_log`` prints its
            ``[START]`` / ``[STEP]`` / ``[END]`` lines to stdout.
    """
    super().__init__()

    # Logging and per-episode bookkeeping.
    self._emit_structured_logs = emit_structured_logs
    self._episode_number = 0
    self._episode_id: str = ""
    self._current_seed: Optional[int] = None
    self._cumulative_reward: float = 0.0

    # Simulation state; EASY is the scenario until reset() selects another.
    self._config: TaskConfig = EASY
    self._rng: random.Random = random.Random()
    self._internal: InternalState = InternalState()
65
+
66
def _emit_log(self, prefix: str, payload: Dict[str, Any]) -> None:
    """Print ``prefix`` plus the JSON-encoded ``payload`` on one flushed line.

    Does nothing when structured logging is disabled.
    """
    if not self._emit_structured_logs:
        return
    encoded = json.dumps(payload)
    print(f"{prefix} {encoded}", flush=True)
69
+
70
def _observation_payload(self, observation: Observation) -> Dict[str, float]:
    """Return the seven numeric observation fields as a ``{name: float}`` dict.

    Used to embed the observation in structured log lines.
    """
    field_names = (
        "provider_a_status",
        "provider_b_status",
        "provider_c_status",
        "budget_remaining",
        "queue_backlog",
        "system_latency",
        "step_count",
    )
    return {name: float(getattr(observation, name)) for name in field_names}
80
+
81
+ # ─── OpenEnv interface ──────────────────────────────────────────────
82
+
83
def reset(
    self,
    seed: Optional[int] = None,
    episode_id: Optional[str] = None,
    scenario: Optional[TaskConfig] = None,
    **kwargs: Any,
) -> Observation:
    """Start a new episode and return its initial observation.

    Args:
        seed: Seed for the episode RNG; ``None`` uses an unseeded generator.
        episode_id: Identifier to report for the episode; a random UUID4
            string is generated when omitted.
        scenario: A ``TaskConfig`` or the name of a preset in
            ``TASK_PRESETS``. Missing values and unknown preset names fall
            back to ``EASY``.
        **kwargs: Only ``scenario`` is consulted, as a fallback.

    Returns:
        The observation for step 0 of the new episode.
    """
    cfg = scenario or kwargs.get("scenario", EASY)
    if isinstance(cfg, str):
        from .tasks import TASK_PRESETS

        cfg = TASK_PRESETS.get(cfg, EASY)
    self._config = cfg

    # Fresh RNG per episode so a given seed reproduces the whole trajectory.
    self._rng = random.Random(seed) if seed is not None else random.Random()

    self._episode_id = episode_id or str(uuid.uuid4())
    self._episode_number += 1
    self._current_seed = seed
    self._cumulative_reward = 0.0

    # Every provider starts at its configured base reliability.
    providers = {
        label: ProviderState(
            name=label,
            base_reliability=getattr(cfg, f"reliability_{label.lower()}"),
            current_health=getattr(cfg, f"reliability_{label.lower()}"),
            cost_per_request=getattr(cfg, f"cost_{label.lower()}"),
            base_latency_ms=getattr(cfg, f"latency_{label.lower()}"),
        )
        for label in ("A", "B", "C")
    }

    def _draw_jitter(span: int) -> int:
        # No RNG draw at all when jitter is disabled, to keep the stream stable.
        return self._rng.randint(-span, span) if span > 0 else 0

    # Primary jitter is drawn before secondary; the order fixes the RNG stream.
    primary_jitter = _draw_jitter(cfg.degradation_start_jitter)
    secondary_jitter = _draw_jitter(cfg.secondary_degradation_start_jitter)
    primary_onset = max(0, cfg.degradation_start_step + primary_jitter)
    secondary_onset = max(0, cfg.secondary_degradation_start_step + secondary_jitter)

    self._internal = InternalState(
        providers=providers,
        budget_dollars=cfg.initial_budget,
        initial_budget_dollars=cfg.initial_budget,
        queue_backlog_count=0,
        max_queue_backlog=cfg.max_queue_backlog,
        last_latency_ms=cfg.latency_a,  # initial non-zero latency
        sla_ceiling_ms=cfg.sla_ceiling_ms,
        current_step=0,
        max_steps=cfg.max_steps,
        episode_done=False,
        history=[],
        provider_window={"A": [], "B": [], "C": []},
        window_size=5,
        actual_degradation_start=primary_onset,
        actual_secondary_degradation_start=secondary_onset,
    )

    initial_observation = self._get_obs()
    start_record = {
        "task": self._config.name,
        "seed": int(seed) if seed is not None else -1,
        "episode": self._episode_number,
    }
    self._emit_log("[START]", start_record)
    return initial_observation
172
+
173
def step(
    self,
    action: OpenEnvAction,
    timeout_s: Optional[float] = None,
    **kwargs: Any,
) -> Observation:
    """Advance the episode by one request: route it to a provider or shed it.

    Args:
        action: An ``Action``, or any object exposing ``action_type`` (and
            optionally ``metadata``), which is converted to an ``Action``.
        timeout_s: Accepted but not used by this method.
        **kwargs: Accepted but not used by this method.

    Returns:
        The next observation, with ``reward``, ``done`` and ``metadata``
        (the per-step info dict) filled in. Once the episode is over,
        further calls return a terminal observation with reward 0.0.
    """
    if self._internal.episode_done:
        terminal = self._get_obs()
        terminal.done = True
        terminal.reward = 0.0
        return terminal

    if not isinstance(action, Action):
        action = Action(
            action_type=getattr(action, "action_type"),
            metadata=getattr(action, "metadata", {}),
        )

    # step() before any reset(): start an episode with the current settings.
    if not self._internal.providers:
        self.reset(seed=self._current_seed, scenario=self._config)

    # Bind aliases only now: reset() above replaces self._internal.
    state = self._internal
    cfg = self._config

    state.current_step += 1
    action_type = action.action_type.value

    # Health decays before the request is processed.
    self._degrade()

    step_info: Dict[str, Any] = {
        "step": state.current_step,
        "action_type": action_type,
        "sla_ceiling_ms": cfg.sla_ceiling_ms,
        "initial_budget": state.initial_budget_dollars,
        "degradation_start_step": state.actual_degradation_start,
        "secondary_degradation_start_step": (
            state.actual_secondary_degradation_start
            if cfg.secondary_degradation_target
            else None
        ),
    }

    def _finish(reward: float) -> Observation:
        """Record the step, build the observation and emit log lines."""
        state.history.append(step_info)
        self._cumulative_reward += reward

        if state.current_step >= state.max_steps:
            state.episode_done = True

        obs = self._get_obs()
        obs.done = state.episode_done
        obs.reward = reward
        obs.metadata = step_info

        self._emit_log(
            "[STEP]",
            {
                "step": state.current_step,
                "action": action_type,
                "reward": float(reward),
                "done": bool(obs.done),
                "observation": self._observation_payload(obs),
            },
        )
        if obs.done:
            # The payload (including grading) is built whether or not
            # logging is enabled.
            self._emit_log(
                "[END]",
                {
                    "task": self._config.name,
                    "seed": int(self._current_seed) if self._current_seed is not None else -1,
                    "episode": self._episode_number,
                    "total_reward": round(float(self._cumulative_reward), 4),
                    "score": _reported_score(float(grade_episode(state.history)["overall_score"])),
                },
            )
        return obs

    # ── Shed load: nothing is routed, flat penalty from step_reward ──
    if action_type == "shed_load":
        reward = step_reward(
            action_type="shed_load",
            request_succeeded=False,
            provider_cost=0.0,
            initial_budget=state.initial_budget_dollars,
            latency_ms=0.0,
            sla_ceiling_ms=cfg.sla_ceiling_ms,
        )
        # Shedding relieves a little queue pressure; no request means no latency.
        state.queue_backlog_count = max(0, state.queue_backlog_count - 1)
        state.last_latency_ms = 0.0
        step_info.update(
            {
                "request_succeeded": False,
                "cost": 0.0,
                "latency_ms": 0.0,
                "reward": reward,
                "provider": None,
                "queue_overflow": False,
            }
        )
        return _finish(reward)

    # ── Route to a provider ──
    provider_name = {"route_to_a": "A", "route_to_b": "B", "route_to_c": "C"}[action_type]
    provider = state.providers[provider_name]
    state.probed_providers.add(provider_name)

    cost = provider.cost_per_request
    state.budget_dollars -= cost

    # Budget at or below zero after paying ends the episode with -10.
    if state.budget_dollars <= 0:
        state.budget_dollars = max(0.0, state.budget_dollars)
        reward = -10.0
        state.episode_done = True
        state.last_latency_ms = 0.0
        step_info.update(
            {
                "request_succeeded": False,
                "cost": cost,
                "latency_ms": 0.0,
                "reward": reward,
                "provider": provider_name,
                "queue_overflow": False,
                "budget_exhausted": True,
            }
        )
        return _finish(reward)

    # Success is a Bernoulli draw against the provider's current health.
    succeeded = self._rng.random() < provider.current_health
    provider.total_requests += 1

    recent = state.provider_window[provider_name]
    recent.append(succeeded)
    if len(recent) > state.window_size:
        recent.pop(0)

    if succeeded:
        provider.successful_requests += 1

    # Latency: base + Gaussian noise, amplified by up to 50% at full backlog.
    base_latency = provider.base_latency_ms
    noise = self._rng.gauss(0, cfg.latency_noise_std)
    backlog_fraction = (
        state.queue_backlog_count / state.max_queue_backlog
        if state.max_queue_backlog > 0
        else 0.0
    )
    amplifier = 1.0 + 0.5 * backlog_fraction
    if succeeded:
        latency = max(10.0, (base_latency + noise) * amplifier)
    else:
        # Failures behave like timeouts: 200 ms extra, noise only adds.
        latency = (base_latency + abs(noise) + 200.0) * amplifier
    state.last_latency_ms = latency

    # Failures add two to the backlog (capped); successes drain one.
    overflow = False
    if succeeded:
        state.queue_backlog_count = max(0, state.queue_backlog_count - 1)
    else:
        state.queue_backlog_count = min(
            state.max_queue_backlog,
            state.queue_backlog_count + 2,
        )
        overflow = state.queue_backlog_count >= state.max_queue_backlog

    reward = step_reward(
        action_type=action_type,
        request_succeeded=succeeded,
        provider_cost=cost,
        initial_budget=state.initial_budget_dollars,
        latency_ms=latency,
        sla_ceiling_ms=cfg.sla_ceiling_ms,
    )
    step_info.update(
        {
            "request_succeeded": succeeded,
            "cost": cost,
            "latency_ms": round(latency, 2),
            "reward": reward,
            "provider": provider_name,
            "queue_overflow": overflow,
        }
    )
    return _finish(reward)
416
+
417
@property
def state(self) -> EnvState:
    """Return an ``EnvState`` snapshot: episode id, step, scenario name, done flag."""
    internal = self._internal
    snapshot = EnvState(
        episode_id=self._episode_id,
        step_count=internal.current_step,
        scenario_name=self._config.name,
        is_done=internal.episode_done,
    )
    return snapshot
426
+
427
+ # ─── Internal methods ──────────────────────────────────────────────
428
+
429
def _get_obs(self) -> Observation:
    """Project the raw internal state onto an ``Observation`` in [0, 1] units."""
    s = self._internal

    def _unit(value: float) -> float:
        # Clamp into [0, 1].
        return max(0.0, min(1.0, value))

    def _fraction(numerator: float, denominator: float) -> float:
        # A non-positive denominator yields 0.0 instead of dividing.
        return numerator / denominator if denominator > 0 else 0.0

    def _status(name: str) -> float:
        # Never-routed providers report 0.5; probed ones report their
        # windowed success rate.
        if name in s.probed_providers:
            return s.get_windowed_success_rate(name)
        return 0.5

    status_a = _status("A")
    status_b = _status("B")
    status_c = _status("C")

    budget_left = max(0.0, _fraction(s.budget_dollars, s.initial_budget_dollars))
    backlog = _fraction(s.queue_backlog_count, s.max_queue_backlog)
    latency = _fraction(s.last_latency_ms, s.sla_ceiling_ms)  # relative to SLA ceiling
    progress = _fraction(s.current_step, s.max_steps)

    return Observation(
        provider_a_status=_unit(status_a),
        provider_b_status=_unit(status_b),
        provider_c_status=_unit(status_c),
        budget_remaining=_unit(budget_left),
        queue_backlog=_unit(backlog),
        system_latency=_unit(latency),
        step_count=_unit(progress),
    )
476
+
477
def _degrade(self) -> None:
    """Lower the health of the configured degradation target(s) for this step.

    The primary target degrades once the current step reaches
    ``actual_degradation_start``; the secondary target (if one is configured)
    once it reaches ``actual_secondary_degradation_start``. Each reduction is
    the configured rate plus Gaussian noise (sigma 0.02), and health never
    drops below 0.05.
    """
    cfg = self._config
    internal = self._internal
    now = internal.current_step

    def _wear_down(target_name: str, rate: float) -> None:
        provider = internal.providers.get(target_name)
        if provider is None:
            # Unknown target: nothing to degrade and no RNG draw.
            return
        noise = self._rng.gauss(0, 0.02)
        provider.current_health = max(0.05, provider.current_health - (rate + noise))

    # Primary first, then secondary: the order fixes the RNG stream.
    if now >= internal.actual_degradation_start:
        _wear_down(cfg.degradation_target, cfg.degradation_rate)

    if cfg.secondary_degradation_target and now >= internal.actual_secondary_degradation_start:
        _wear_down(cfg.secondary_degradation_target, cfg.secondary_degradation_rate)
budget_router/models.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from enum import Enum
3
+ from typing import Any, Dict, List, Literal, Optional
4
+
5
+ from openenv_core.env_server.types import (
6
+ Action as BaseAction,
7
+ Observation as BaseObservation,
8
+ State as BaseState,
9
+ )
10
+
11
+
12
+ # =============================================================================
13
+ # Action — extends OpenEnv Action
14
+ # =============================================================================
15
+
16
+
17
class ActionType(str, Enum):
    """The four possible routing actions.

    Members are also ``str`` instances, so each compares equal to its value.
    """

    # NOTE: definition order is the iteration order of ``list(ActionType)``;
    # reordering members changes what a seeded random choice returns.
    ROUTE_TO_A = "route_to_a"  # send the request to provider A
    ROUTE_TO_B = "route_to_b"  # send the request to provider B
    ROUTE_TO_C = "route_to_c"  # send the request to provider C
    SHED_LOAD = "shed_load"  # drop the request without routing it
24
+
25
+
26
@dataclass(kw_only=True)
class Action(BaseAction):
    """
    A single agent decision: route the request to one provider or shed load.

    Builds on the OpenEnv ``Action`` base, which supplies ``metadata``.
    """

    action_type: Literal["route_to_a", "route_to_b", "route_to_c", "shed_load"]

    def __post_init__(self) -> None:
        """Convert a string ``action_type`` into its ``ActionType`` member.

        Raises ``ValueError`` (from the enum lookup) for an unknown string.
        """
        raw = self.action_type
        if not isinstance(raw, str):
            return
        self.action_type = ActionType(raw)
39
+
40
+
41
+ # =============================================================================
42
+ # Observation — extends OpenEnv Observation
43
+ # =============================================================================
44
+
45
+
46
@dataclass(kw_only=True)
class Observation(BaseObservation):
    """
    What the agent sees each step. Every numeric field is clamped to [0.0, 1.0].

    Builds on the OpenEnv ``Observation`` base, which supplies ``done``,
    ``reward`` and ``metadata``.
    """

    # Per-provider status (recent success rate)
    provider_a_status: float
    provider_b_status: float
    provider_c_status: float

    # Resources
    budget_remaining: float
    queue_backlog: float
    system_latency: float

    # Progress through the episode
    step_count: float

    def __post_init__(self) -> None:
        """Clamp each numeric field into [0.0, 1.0] after construction."""
        numeric_fields = (
            "provider_a_status",
            "provider_b_status",
            "provider_c_status",
            "budget_remaining",
            "queue_backlog",
            "system_latency",
            "step_count",
        )
        for name in numeric_fields:
            raw = getattr(self, name)
            bounded = max(0.0, min(1.0, raw))
            setattr(self, name, bounded)
78
+
79
+
80
+ # =============================================================================
81
+ # Internal State (raw units, for debugging / trace only)
82
+ # =============================================================================
83
+
84
+
85
@dataclass
class ProviderState:
    """Raw-unit bookkeeping for one provider."""

    name: str
    base_reliability: float  # reliability at episode start, in [0, 1]
    current_health: float  # present success probability, in [0, 1]
    cost_per_request: float  # dollars charged per routed request
    base_latency_ms: float  # latency before noise and backlog effects, ms
    total_requests: int = 0
    successful_requests: int = 0

    @property
    def observed_success_rate(self) -> float:
        """Cumulative success ratio over all requests sent to this provider.

        Returns ``base_reliability`` while no request has been recorded.
        """
        attempts = self.total_requests
        if attempts != 0:
            return self.successful_requests / attempts
        return self.base_reliability
103
+
104
+
105
@dataclass
class InternalState:
    """
    Complete simulator state in raw units; the agent never sees this directly.
    Intended for traces, debugging and the oracle policy.
    """

    providers: Dict[str, ProviderState] = field(default_factory=dict)
    budget_dollars: float = 0.0
    initial_budget_dollars: float = 0.0
    queue_backlog_count: int = 0
    max_queue_backlog: int = 10
    last_latency_ms: float = 0.0
    sla_ceiling_ms: float = 500.0
    current_step: int = 0
    max_steps: int = 20
    episode_done: bool = False
    history: List[Dict[str, Any]] = field(default_factory=list)

    # Most recent request outcomes per provider (bounded by window_size)
    provider_window: Dict[str, List[bool]] = field(default_factory=dict)
    window_size: int = 5

    # Names of providers that have received at least one request
    probed_providers: set = field(default_factory=set)

    # Degradation onset steps after per-episode jitter has been applied
    actual_degradation_start: int = 0
    actual_secondary_degradation_start: int = 999

    def get_windowed_success_rate(self, provider_name: str) -> float:
        """Return the success ratio over the provider's recorded window.

        With no recorded outcomes, returns the provider's ``base_reliability``.
        """
        recent = self.provider_window.get(provider_name, [])
        if recent:
            return sum(recent) / len(recent)
        return self.providers[provider_name].base_reliability
141
+
142
+
143
+ # =============================================================================
144
+ # Task Configuration
145
+ # =============================================================================
146
+
147
+
148
@dataclass
class TaskConfig:
    """
    Configuration for a task scenario. Passed to reset(scenario=config).
    NOT a subclass — just a data container.

    Field order is part of the positional constructor signature; add new
    fields at the end.
    """

    name: str
    description: str

    # Budget
    initial_budget: float = 5.0  # dollars

    # Provider costs (per request, dollars)
    cost_a: float = 0.01
    cost_b: float = 0.05
    cost_c: float = 0.10

    # Provider base reliability (also the starting health of each provider)
    reliability_a: float = 0.70
    reliability_b: float = 0.90
    reliability_c: float = 0.99

    # Provider base latency (ms)
    latency_a: float = 100.0
    latency_b: float = 150.0
    latency_c: float = 200.0

    # SLA: latency ceiling in ms
    sla_ceiling_ms: float = 500.0

    # Degradation config (primary)
    degradation_start_step: int = 0  # step at which degradation begins
    degradation_rate: float = 0.0  # health reduction per step for `degradation_target`
    degradation_target: str = "A"  # which provider degrades
    degradation_start_jitter: int = 0  # ±jitter applied per episode to degradation_start_step

    # Secondary degradation (for multi-provider scenarios)
    secondary_degradation_start_step: int = 999  # 999 = no secondary degradation
    secondary_degradation_rate: float = 0.0
    secondary_degradation_target: str = ""  # empty = no secondary degradation
    secondary_degradation_start_jitter: int = 0  # ±jitter applied per episode to secondary_degradation_start_step

    # Episode
    max_steps: int = 20
    max_queue_backlog: int = 10

    # Stochastic noise
    latency_noise_std: float = 30.0  # ms std dev added to base latency
197
+
198
+
199
+ # =============================================================================
200
+ # OpenEnv State — extends BaseState
201
+ # =============================================================================
202
+
203
+
204
@dataclass
class EnvState(BaseState):
    """
    OpenEnv-compatible state object returned by the `state` property.
    Extends BaseState (which provides `episode_id`, `step_count` fields).
    """

    scenario_name: str = ""  # name of the scenario (TaskConfig) in use
    is_done: bool = False  # True once the episode has ended
budget_router/policies.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Policies for the Budget Router environment.
3
+
4
+ 6 policies:
5
+ - random_policy: uniform random baseline (lower bound)
6
+ - heuristic_baseline_policy: stateless cheapest-viable routing
7
+ - debug_upper_bound_policy: oracle with internal state access (test only)
8
+ - always_route_a_policy: degenerate (always cheapest)
9
+ - always_route_b_policy: degenerate (always balanced fallback)
10
+ - always_route_c_policy: degenerate (always most reliable)
11
+ - always_shed_load_policy: degenerate (always shed)
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import random as stdlib_random
17
+ from typing import Optional
18
+
19
+ from .models import Action, ActionType, InternalState, Observation
20
+ from .reward import BUDGET_WEIGHT
21
+
22
+
23
def random_policy(obs: Observation, rng: Optional[stdlib_random.Random] = None) -> Action:
    """Pick one of the four actions uniformly at random, ignoring ``obs``.

    Args:
        obs: Current observation (unused).
        rng: Random source to draw from; a fresh unseeded ``Random`` is
            created when omitted.
    """
    source = rng or stdlib_random.Random()
    options = list(ActionType)
    picked = source.choice(options)
    return Action(action_type=picked)
28
+
29
+
30
def heuristic_baseline_policy(obs: Observation) -> Action:
    """
    Stateless baseline: route to the first viable provider in cost order.

    A provider is viable when its observed status is above the threshold or
    is exactly 0.5 (the value reported for a provider never routed to).
    When ``budget_remaining`` is below 0.10, provider C is not considered.
    If no considered provider is viable, the request is shed.

    Uses only the observation; no privileged state.
    """
    threshold = 0.52

    # Cheapest first: A, B, C.
    by_cost = (
        ("route_to_a", obs.provider_a_status),
        ("route_to_b", obs.provider_b_status),
        ("route_to_c", obs.provider_c_status),
    )

    # Low budget: drop the most expensive provider to steer clear of the
    # -10.0 budget-exhaustion penalty.
    low_budget = obs.budget_remaining < 0.10
    candidates = by_cost[:2] if low_budget else by_cost

    for action_name, status in candidates:
        viable = status > threshold or status == 0.5
        if viable:
            return Action(action_type=ActionType(action_name))

    # Nothing viable among the candidates.
    return Action(action_type=ActionType.SHED_LOAD)
64
+
65
+
66
def debug_upper_bound_policy(obs: Observation, internal_state: InternalState) -> Action:
    """
    Oracle policy with access to true internal health values.
    Used ONLY for debugging and validation — NOT a fair benchmark.

    Strategy: choose the provider with the highest expected per-step reward,
    computed from true health, among providers that are budget-feasible.

    Feasibility is checked in two passes:
      1. providers that could be used on every remaining step and still
         leave budget above zero;
      2. if none qualifies, providers that can be paid for once and still
         leave budget above zero.

    Both checks are strict, because the environment ends the episode with a
    -10 penalty as soon as the budget is at or below zero after paying.
    Sheds load when nothing is feasible or the best expected value is
    below -0.5.

    Args:
        obs: Current observation (unused; the oracle reads internal state).
        internal_state: The environment's raw internal state.
    """
    initial_budget = internal_state.initial_budget_dollars
    if initial_budget <= 0:
        # Avoid dividing by zero in the cost term.
        initial_budget = 1.0

    budget_dollars = internal_state.budget_dollars
    remaining_steps = max(1, internal_state.max_steps - internal_state.current_step)

    providers_info = [
        (
            f"route_to_{key.lower()}",
            internal_state.providers[key].current_health,
            internal_state.providers[key].cost_per_request,
        )
        for key in ("A", "B", "C")
    ]

    def _expected_reward(health: float, cost: float) -> float:
        # P(success) * 1.0 + P(fail) * -2.0 - (cost / initial_budget) * BUDGET_WEIGHT
        return health * 1.0 + (1.0 - health) * (-2.0) - (cost / initial_budget) * BUDGET_WEIGHT

    def _best_among(is_feasible):
        """Return (action_name, expected_value) of the best feasible provider."""
        chosen, chosen_ev = None, float("-inf")
        for action_name, health, cost in providers_info:
            if not is_feasible(cost):
                continue
            ev = _expected_reward(health, cost)
            if ev > chosen_ev:
                chosen, chosen_ev = action_name, ev
        return chosen, chosen_ev

    # Pass 1: sustainable for all remaining steps with budget left over.
    best_action, best_ev = _best_among(
        lambda cost: cost * remaining_steps < budget_dollars
    )

    if best_action is None:
        # Pass 2: affordable once without draining the budget to zero.
        best_action, best_ev = _best_among(lambda cost: cost < budget_dollars)

    if best_action is None or best_ev < -0.5:
        return Action(action_type=ActionType.SHED_LOAD)

    return Action(action_type=ActionType(best_action))
122
+
123
+
124
def always_route_a_policy(obs: Observation) -> Action:
    """Degenerate baseline: ignore ``obs`` and route to provider A (cheapest)."""
    fixed_choice = ActionType.ROUTE_TO_A
    return Action(action_type=fixed_choice)
127
+
128
+
129
def always_route_b_policy(obs: Observation) -> Action:
    """Degenerate baseline: ignore ``obs`` and route to provider B (balanced)."""
    fixed_choice = ActionType.ROUTE_TO_B
    return Action(action_type=fixed_choice)
132
+
133
+
134
def always_route_c_policy(obs: Observation) -> Action:
    """Degenerate baseline: ignore ``obs`` and route to provider C (most expensive/reliable)."""
    fixed_choice = ActionType.ROUTE_TO_C
    return Action(action_type=fixed_choice)
137
+
138
+
139
def always_shed_load_policy(obs: Observation) -> Action:
    """Degenerate baseline: ignore ``obs`` and shed every request (never routes)."""
    fixed_choice = ActionType.SHED_LOAD
    return Action(action_type=fixed_choice)
budget_router/reward.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reward computation for the Budget Router environment.
3
+
4
+ Per-step reward (4 additive terms max) and episode-level grader metrics.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ from typing import Any, Dict, List
11
+
12
+
13
BUDGET_WEIGHT = 5.0  # Scales cost penalty so it's meaningful vs success/failure signal


def step_reward(
    action_type: str,
    request_succeeded: bool,
    provider_cost: float,
    initial_budget: float,
    latency_ms: float,
    sla_ceiling_ms: float,
) -> float:
    """
    Compute the reward for a single environment step.

    ``shed_load`` earns a flat -0.5 and skips every routing term.
    For any other (routing) action the reward is the sum of:
        +1.0 if the request succeeded, -2.0 if it failed
        -(provider_cost / initial_budget) * BUDGET_WEIGHT   (always applied)
        -(excess_latency / sla_ceiling_ms)                  (only when latency_ms > sla_ceiling_ms)

    Args:
        action_type: Action name; only the literal "shed_load" is special-cased.
        request_succeeded: Whether the routed request succeeded.
        provider_cost: Cost charged for this request.
        initial_budget: Episode budget used to normalise the cost; values <= 0
            are replaced by 1.0 to avoid division by zero.
        latency_ms: Observed latency of the request.
        sla_ceiling_ms: Latency ceiling; values <= 0 are replaced by 1.0.

    Returns:
        float: The step reward. Always finite: a NaN or infinite result
        (e.g. from NaN/inf cost or latency inputs) is replaced by -2.0.
    """
    # shed_load: flat penalty, no routing terms
    if action_type == "shed_load":
        return -0.5

    # Safety: keep the two denominators strictly positive
    if initial_budget <= 0:
        initial_budget = 1.0
    if sla_ceiling_ms <= 0:
        sla_ceiling_ms = 1.0

    # Term 1: success / failure
    reward = 1.0 if request_succeeded else -2.0

    # Term 2: cost penalty (always applied for routing actions)
    reward += -(provider_cost / initial_budget) * BUDGET_WEIGHT

    # Term 3: latency breach penalty, proportional to the overshoot
    if latency_ms > sla_ceiling_ms:
        excess = latency_ms - sla_ceiling_ms
        reward += -(excess / sla_ceiling_ms)

    # Safety: a NaN *or* infinite reward is treated like a failed request.
    # (math.isnan alone would let +/-inf through.)
    if not math.isfinite(reward):
        reward = -2.0

    return reward
69
+
70
+
71
def episode_metrics(history: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Compute deterministic episode-level grader metrics.

    Args:
        history: List of step info dicts from the episode. Keys read here:
            "reward", "action_type", "request_succeeded", "cost",
            "latency_ms", "queue_overflow", and "sla_ceiling_ms" (taken from
            the first record only). Missing or ``None`` numeric values count
            as 0.0; a missing or ``None`` SLA ceiling falls back to 500.0.

    Returns:
        Dict with grader metrics:
            - total_reward: sum of per-step rewards, rounded to 4 places
            - success_rate: successes / routed steps (shed_load steps excluded),
              0.0 when nothing was routed
            - total_cost_spent: sum of per-step costs, rounded to 4 places
            - average_latency_ms: mean latency over routed steps, rounded to 2 places
            - sla_met (bool): True when every routed step is within the ceiling
              (vacuously True when nothing was routed)
            - queue_overflow_events (int)
    """
    if not history:
        return {
            "total_reward": 0.0,
            "success_rate": 0.0,
            "total_cost_spent": 0.0,
            "average_latency_ms": 0.0,
            "sla_met": True,
            "queue_overflow_events": 0,
        }

    def _number(record: Dict[str, Any], key: str) -> Any:
        # Treat an explicit None exactly like a missing key so sum()/<= never
        # see None; every non-None value is passed through untouched.
        value = record.get(key)
        return 0.0 if value is None else value

    # Only routing steps (not shed_load) contribute to success rate and latency.
    routing_steps = [h for h in history if h.get("action_type") != "shed_load"]
    latencies = [_number(h, "latency_ms") for h in routing_steps]

    total_reward = sum(_number(h, "reward") for h in history)
    total_cost = sum(_number(h, "cost") for h in history)

    if routing_steps:
        successes = sum(1 for h in routing_steps if h.get("request_succeeded", False))
        success_rate = successes / len(routing_steps)
        avg_latency = sum(latencies) / len(latencies)
    else:
        success_rate = 0.0
        avg_latency = 0.0

    # The ceiling is read from the first record only; None falls back to the
    # same 500.0 default that grade_episode() uses.
    sla_ceiling = history[0].get("sla_ceiling_ms")
    if sla_ceiling is None:
        sla_ceiling = 500.0
    sla_met = all(lat <= sla_ceiling for lat in latencies)

    queue_overflows = sum(1 for h in history if h.get("queue_overflow", False))

    return {
        "total_reward": round(total_reward, 4),
        "success_rate": round(success_rate, 4),
        "total_cost_spent": round(total_cost, 4),
        "average_latency_ms": round(avg_latency, 2),
        "sla_met": sla_met,
        "queue_overflow_events": queue_overflows,
    }
125
+
126
+
127
def grade_episode(history: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Compute the episode-level grader score in [0, 1] with a weighted breakdown.

        overall = 0.30 * success_score
                + 0.20 * latency_score
                + 0.15 * budget_score
                + 0.15 * sla_score
                + 0.20 * adaptation_score

    If any step record has ``budget_exhausted`` set, the weighted sum is then
    multiplied by 0.75. Every component is clamped to [0.0, 1.0] *before* it
    is weighted, so ``overall_score`` always agrees with the reported breakdown.

    Component definitions:
        success_score: Fraction of ALL episode steps with a successful routed
            request. Denominator = total steps (not routed steps), so partial
            abstention is penalised.
        latency_score: 1.0 - (avg_latency / sla_ceiling); 0.0 if nothing was routed.
        budget_score: Fraction of initial budget NOT spent.
        sla_score: Fraction of routed requests with latency <= sla_ceiling;
            0.0 if nothing was routed.
        adaptation_score: Post-degradation success rate over routed steps.

    Adaptation window, driven by ``degradation_start_step`` and
    ``secondary_degradation_start_step`` in the FIRST history record:
        - No degradation (start missing or >= 999, e.g. easy): there is no
          post-degradation window, so the score is successes / total steps
          (same value as success_score). An always-shed policy gets 0.0.
        - Single degradation (e.g. medium, hard): routed steps with
          step > max(start, 1), i.e. at least one warm-up step is excluded.
        - Two degradations (e.g. hard_multi): 0.5 * success rate on routed steps
          with max(start, 1) < step <= secondary_start, plus 0.5 * success rate
          on routed steps with step > secondary_start.
        An empty window contributes 0.0.

    Args:
        history: List of step info dicts from the episode.

    Returns:
        Dict with 'overall_score' and the five component scores, each rounded
        to 4 decimal places. All zeros for an empty history.
    """
    # Note: step_reward() is shaped for learning signal (dense + budget cliff).
    # grade_episode() is the semantic evaluation metric. Divergence is intentional.
    if not history:
        return {
            "overall_score": 0.0,
            "success_score": 0.0,
            "latency_score": 0.0,
            "budget_score": 0.0,
            "sla_score": 0.0,
            "adaptation_score": 0.0,
        }

    def _clamp01(value: float) -> float:
        # Pin a component score into [0.0, 1.0].
        return max(0.0, min(1.0, value))

    def _success_fraction(window: List[Dict[str, Any]]) -> float:
        # Share of successful requests in a window; an empty window scores 0.0.
        if not window:
            return 0.0
        hits = sum(1 for h in window if h.get("request_succeeded", False))
        return hits / len(window)

    first = history[0]  # episode-level config is read from the first record only
    metrics = episode_metrics(history)

    # success_score: denominator is total steps, so a policy that serves 10/20
    # and succeeds each time scores 0.50, not 1.0.
    total_steps = len(history)
    routing_steps = [h for h in history if h.get("action_type") != "shed_load"]
    routed_successes = sum(1 for h in routing_steps if h.get("request_succeeded", False))
    success_score = _clamp01(routed_successes / total_steps)

    sla_ceiling_ms = float(first.get("sla_ceiling_ms", 500.0) or 500.0)
    if sla_ceiling_ms <= 0:
        sla_ceiling_ms = 1.0
    avg_latency_ms = float(metrics.get("average_latency_ms", 0.0))

    # No routing attempts = no service delivered, so the quality scores are 0.
    if routing_steps:
        latency_score = _clamp01(1.0 - min(1.0, avg_latency_ms / sla_ceiling_ms))
        sla_ok = sum(
            1 for h in routing_steps
            if float(h.get("latency_ms", 0.0)) <= sla_ceiling_ms
        )
        sla_score = _clamp01(sla_ok / len(routing_steps))
    else:
        latency_score = 0.0
        sla_score = 0.0

    # Budget score: spending relative to the initial budget, not a theoretical max.
    total_cost = float(metrics.get("total_cost_spent", 0.0))
    initial_budget = float(first.get("initial_budget", 1.0) or 1.0)
    if initial_budget <= 0:
        # Same fallback step_reward() applies; a negative budget would otherwise
        # turn spending into a score above 1.0.
        initial_budget = 1.0
    budget_score = _clamp01(1.0 - total_cost / initial_budget)

    # Adaptation score: success rate after degradation begins.
    raw_primary = first.get("degradation_start_step")
    degradation_start = int(raw_primary) if raw_primary is not None else 999
    raw_secondary = first.get("secondary_degradation_start_step")
    secondary_start = int(raw_secondary) if raw_secondary is not None else None

    if degradation_start >= 999:
        # No degradation event: fall back to routing quality over ALL steps, so
        # a do-nothing (always shed_load) policy gets 0, not full marks.
        adaptation_score = success_score
    else:
        # max(..., 1) guarantees one warm-up step even when degradation starts at 0.
        warmup_cutoff = max(degradation_start, 1)
        if secondary_start is None:
            post_degrade = [
                h for h in routing_steps if int(h.get("step", 0)) > warmup_cutoff
            ]
            adaptation_score = _success_fraction(post_degrade)
        else:
            # Two degradation events: blend the two windows equally.
            primary_window = [
                h for h in routing_steps
                if warmup_cutoff < int(h.get("step", 0)) <= secondary_start
            ]
            secondary_window = [
                h for h in routing_steps if int(h.get("step", 0)) > secondary_start
            ]
            adaptation_score = (
                0.5 * _success_fraction(primary_window)
                + 0.5 * _success_fraction(secondary_window)
            )
    adaptation_score = _clamp01(adaptation_score)

    overall = (
        0.3 * success_score
        + 0.2 * latency_score
        + 0.15 * budget_score
        + 0.15 * sla_score
        + 0.2 * adaptation_score
    )

    # Hard penalty for budget exhaustion: 0.75x preserves partial credit for good
    # routing before exhaustion, but makes budget-exhausted policies non-competitive.
    if any(h.get("budget_exhausted", False) for h in history):
        overall = overall * 0.75

    return {
        "overall_score": round(_clamp01(overall), 4),
        "success_score": round(success_score, 4),
        "latency_score": round(latency_score, 4),
        "budget_score": round(budget_score, 4),
        "sla_score": round(sla_score, 4),
        "adaptation_score": round(adaptation_score, 4),
    }
budget_router/tasks.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Task preset configurations: EASY, MEDIUM, HARD, HARD_MULTI.
3
+
4
+ Each is a TaskConfig instance passed to reset(scenario=config).
5
+ """
6
+
7
+ from .models import TaskConfig
8
+
9
+
10
# EASY preset: degradation_rate is 0.0 and the start step is the 999 sentinel,
# so no provider degrades during the 20-step episode.
EASY = TaskConfig(
    # --- identity ---
    name="easy",
    description="Stable providers. Cheapest is viable but not dominant. Smart routing wins.",
    # --- episode shape ---
    max_steps=20,
    max_queue_backlog=10,
    # --- budget and per-request costs ---
    initial_budget=1.0,
    cost_a=0.01,
    cost_b=0.05,
    cost_c=0.10,
    # --- provider reliability ---
    reliability_a=0.76,  # lowered so always-A isn't dominant; forces routing quality to matter
    reliability_b=0.92,
    reliability_c=0.99,
    # --- latency profile ---
    latency_a=100.0,
    latency_b=150.0,
    latency_c=200.0,
    latency_noise_std=30.0,
    sla_ceiling_ms=500.0,
    # --- degradation ---
    degradation_target="A",
    degradation_start_step=999,  # effectively no degradation
    degradation_rate=0.0,
)
31
+
32
+
33
# MEDIUM preset: provider A starts degrading at step 5, with a slightly
# smaller budget than EASY.
MEDIUM = TaskConfig(
    # --- identity ---
    name="medium",
    description="Provider A degrades sharply after step 5. Must adapt routing.",
    # --- episode shape ---
    max_steps=20,
    max_queue_backlog=10,
    # --- budget and per-request costs ---
    initial_budget=0.95,
    cost_a=0.01,
    cost_b=0.05,
    cost_c=0.10,
    # --- provider reliability ---
    reliability_a=0.85,
    reliability_b=0.92,
    reliability_c=0.99,
    # --- latency profile ---
    latency_a=100.0,
    latency_b=150.0,
    latency_c=200.0,
    latency_noise_std=30.0,
    sla_ceiling_ms=500.0,
    # --- degradation ---
    degradation_target="A",
    degradation_start_step=5,
    degradation_rate=0.15,  # sharp drop after step 5
)
54
+
55
+
56
# HARD preset: provider A degrades from the very first step, with the smallest
# budget of the presets and more latency noise.
HARD = TaskConfig(
    # --- identity ---
    name="hard",
    description="Provider A degrades aggressively from step 0. Tight budget. High noise. Must diversify immediately.",
    # --- episode shape ---
    max_steps=20,
    max_queue_backlog=10,
    # --- budget and per-request costs ---
    initial_budget=0.85,
    cost_a=0.01,
    cost_b=0.05,
    cost_c=0.10,
    # --- provider reliability ---
    reliability_a=0.85,
    reliability_b=0.92,
    reliability_c=0.99,
    # --- latency profile ---
    latency_a=100.0,
    latency_b=150.0,
    latency_c=200.0,
    latency_noise_std=50.0,  # higher than the 30.0 used by EASY/MEDIUM (was 40.0)
    sla_ceiling_ms=500.0,
    # --- degradation ---
    degradation_target="A",
    degradation_start_step=0,  # degrades from the start
    degradation_start_jitter=3,
    degradation_rate=0.15,  # same value as MEDIUM, but active from step 0 (was 0.08)
)
78
+
79
+
80
# HARD_MULTI preset: two degradation events — provider A from step 0, then
# provider B from step 10 — with the largest budget of the presets.
HARD_MULTI = TaskConfig(
    # --- identity ---
    name="hard_multi",
    description="A degrades from step 0, B degrades from step 10. Multi-provider cascade. Slightly wider budget to reward efficient routing.",
    # --- episode shape ---
    max_steps=20,
    max_queue_backlog=10,
    # --- budget and per-request costs ---
    initial_budget=1.10,
    cost_a=0.01,
    cost_b=0.05,
    cost_c=0.10,
    # --- provider reliability ---
    reliability_a=0.85,
    reliability_b=0.92,
    reliability_c=0.99,
    # --- latency profile ---
    latency_a=100.0,
    latency_b=150.0,
    latency_c=200.0,
    latency_noise_std=50.0,
    sla_ceiling_ms=500.0,
    # --- primary degradation (provider A) ---
    degradation_target="A",
    degradation_start_step=0,
    degradation_start_jitter=3,
    degradation_rate=0.12,
    # --- secondary degradation (provider B) ---
    secondary_degradation_target="B",
    secondary_degradation_start_step=10,
    secondary_degradation_start_jitter=3,
    secondary_degradation_rate=0.10,
)
106
+
107
+
108
# Lookup table from preset name to its TaskConfig instance.
TASK_PRESETS = dict(
    easy=EASY,
    medium=MEDIUM,
    hard=HARD,
    hard_multi=HARD_MULTI,
)
budget_router/tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Tests for the Budget Router environment - package init."""
budget_router/tests/test_environment.py ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for the Budget Router environment core correctness and reward sanity.
3
+
4
+ All tests from <test_requirements> are implemented here.
5
+ """
6
+
7
+ import math
8
+ import random
9
+
10
+ import pytest
11
+
12
+ from budget_router.environment import BudgetRouterEnv
13
+ from budget_router.models import Action, ActionType, Observation
14
+ from budget_router.policies import (
15
+ always_route_a_policy,
16
+ always_route_b_policy,
17
+ always_route_c_policy,
18
+ always_shed_load_policy,
19
+ heuristic_baseline_policy,
20
+ random_policy,
21
+ )
22
+ from budget_router.reward import step_reward
23
+ from budget_router.tasks import EASY, HARD, HARD_MULTI, MEDIUM
24
+
25
+
26
+ # ─── Helpers ────────────────────────────────────────────────────────────
27
+
28
+
29
def run_full_episode(env, policy_fn, seed, scenario, policy_name=""):
    """Drive one episode and return ``(observations, rewards, done_flag, steps)``.

    The episode stops when the observation reports ``done`` or after
    ``scenario.max_steps`` steps, whichever comes first. ``observations``
    includes the reset observation, so it is one longer than ``rewards``.
    When ``policy_name`` contains "random", the policy is called with an
    ``rng`` keyword seeded from ``seed + 10000``.
    """
    wants_rng = "random" in policy_name
    rng = random.Random(seed + 10000) if wants_rng else None

    current = env.reset(seed=seed, scenario=scenario)
    trajectory = [current]
    rewards = []

    # len(rewards) doubles as the step counter: one reward is recorded per step.
    while not current.done and len(rewards) < scenario.max_steps:
        action = policy_fn(current, rng=rng) if wants_rng else policy_fn(current)
        current = env.step(action)
        trajectory.append(current)
        rewards.append(current.reward)

    return trajectory, rewards, current.done, len(rewards)
48
+
49
+
50
+ # ─── Core Correctness Tests ────────────────────────────────────────────
51
+
52
+
53
class TestCoreCorrectness:
    """Core environment correctness tests."""

    # Observation fields these tests require to stay within [0.0, 1.0].
    _PROVIDER_FIELDS = (
        "provider_a_status",
        "provider_b_status",
        "provider_c_status",
    )
    _UNIT_FIELDS = _PROVIDER_FIELDS + (
        "budget_remaining",
        "queue_backlog",
        "system_latency",
        "step_count",
    )

    def test_reset_returns_valid_observation(self):
        """reset() returns Observation with ALL values in [0.0, 1.0]."""
        first = BudgetRouterEnv().reset(seed=42, scenario=EASY)

        assert isinstance(first, Observation)
        for field in self._UNIT_FIELDS:
            assert 0.0 <= getattr(first, field) <= 1.0

    def test_step_after_reset_no_crash(self):
        """step() after reset() does not crash and returns valid types."""
        env = BudgetRouterEnv()
        env.reset(seed=42, scenario=EASY)
        result = env.step(Action(action_type=ActionType.ROUTE_TO_A))

        assert isinstance(result, Observation)
        assert isinstance(result.done, bool)
        assert isinstance(result.reward, (int, float))

    def test_step_before_reset_no_crash(self):
        """step() before reset() auto-initializes so the default OpenEnv web UI is safe."""
        result = BudgetRouterEnv().step(Action(action_type=ActionType.ROUTE_TO_A))

        assert isinstance(result, Observation)
        assert isinstance(result.done, bool)
        assert isinstance(result.reward, (int, float))

    def test_episode_terminates_at_or_before_20(self):
        """Episode terminates at or before step 20."""
        env = BudgetRouterEnv()
        for scenario in (EASY, MEDIUM, HARD):
            obs = env.reset(seed=42, scenario=scenario)
            steps = 0
            # Loop bound of 25 leaves margin so an over-long episode is detectable.
            while not obs.done and steps < 25:
                obs = env.step(Action(action_type=ActionType.ROUTE_TO_B))
                steps += 1
            assert steps <= 20, f"Episode ran {steps} steps on {scenario.name}"

    def test_deterministic_trajectories_same_seed(self):
        """Two episodes with the same seed produce identical reward sequences."""
        env = BudgetRouterEnv()

        first_rewards = run_full_episode(
            env, heuristic_baseline_policy, seed=42, scenario=MEDIUM
        )[1]
        second_rewards = run_full_episode(
            env, heuristic_baseline_policy, seed=42, scenario=MEDIUM
        )[1]

        assert len(first_rewards) == len(second_rewards)
        for r1, r2 in zip(first_rewards, second_rewards):
            assert r1 == r2, f"Rewards differ: {r1} vs {r2}"

    def test_budget_remaining_never_nan(self):
        """budget_remaining never returns NaN."""
        env = BudgetRouterEnv()
        for scenario in (EASY, MEDIUM, HARD):
            trajectory = run_full_episode(
                env, heuristic_baseline_policy, seed=42, scenario=scenario
            )[0]
            for obs in trajectory:
                assert not math.isnan(obs.budget_remaining), "budget_remaining is NaN"

    def test_provider_status_in_bounds(self):
        """All provider_status values stay in [0.0, 1.0] throughout episode."""
        env = BudgetRouterEnv()
        for scenario in (EASY, MEDIUM, HARD):
            trajectory = run_full_episode(
                env, heuristic_baseline_policy, seed=0, scenario=scenario
            )[0]
            for obs in trajectory:
                for field in self._PROVIDER_FIELDS:
                    assert 0.0 <= getattr(obs, field) <= 1.0

    def test_system_latency_not_always_zero(self):
        """system_latency is NOT always 0.0 across a full episode (dead channel guard)."""
        trajectory = run_full_episode(
            BudgetRouterEnv(), heuristic_baseline_policy, seed=42, scenario=MEDIUM
        )[0]
        # The reset observation is skipped: its latency may just be the initial value.
        post_reset = trajectory[1:]
        assert any(obs.system_latency > 0.0 for obs in post_reset), "system_latency is always 0.0 — dead channel"

    def test_all_observation_fields_in_range(self):
        """All Observation fields remain within [0.0, 1.0] at every step."""
        env = BudgetRouterEnv()
        for scenario in (EASY, MEDIUM, HARD):
            for seed in (0, 1, 2):
                trajectory = run_full_episode(
                    env, heuristic_baseline_policy, seed=seed, scenario=scenario
                )[0]
                for obs in trajectory:
                    for field in self._UNIT_FIELDS:
                        assert 0.0 <= getattr(obs, field) <= 1.0
169
+
170
+
171
+ # ─── Reward Sanity Tests ───────────────────────────────────────────────
172
+
173
+
174
class TestRewardSanity:
    """Reward correctness tests."""

    def test_shed_load_reward_less_than_successful_route_c(self):
        """shed_load reward < successful route_to_c reward (holding all else equal)."""
        shed_r = step_reward("shed_load", False, 0.0, 5.0, 0.0, 500.0)
        route_c_r = step_reward("route_to_c", True, 0.10, 5.0, 200.0, 500.0)
        assert shed_r < route_c_r, f"shed ({shed_r}) >= route_c success ({route_c_r})"

    def test_failed_route_less_than_successful_route(self):
        """Failed route reward < successful route reward."""
        failed_r = step_reward("route_to_a", False, 0.01, 5.0, 300.0, 500.0)
        success_r = step_reward("route_to_a", True, 0.01, 5.0, 100.0, 500.0)
        assert failed_r < success_r, f"failed ({failed_r}) >= success ({success_r})"

    def test_route_a_cost_less_than_route_c_cost(self):
        """route_to_a cost < route_to_c cost in the observation metadata."""
        env = BudgetRouterEnv()

        # Each provider is sampled on the first step of a freshly reset episode.
        env.reset(seed=42, scenario=EASY)
        cost_a = env.step(Action(action_type=ActionType.ROUTE_TO_A)).metadata.get("cost", 0)

        env.reset(seed=42, scenario=EASY)
        cost_c = env.step(Action(action_type=ActionType.ROUTE_TO_C)).metadata.get("cost", 0)

        assert cost_a < cost_c, f"cost_a ({cost_a}) >= cost_c ({cost_c})"

    def test_route_a_under_hard_degradation_lower_cumulative(self):
        """route_to_a under hard degradation gets lower cumulative reward than route_to_c."""
        env = BudgetRouterEnv()
        returns_a = []
        returns_c = []

        for seed in range(5):
            rewards_a = run_full_episode(
                env, always_route_a_policy, seed=seed, scenario=HARD
            )[1]
            returns_a.append(sum(r or 0 for r in rewards_a))

            rewards_c = run_full_episode(
                env, always_route_c_policy, seed=seed, scenario=HARD
            )[1]
            returns_c.append(sum(r or 0 for r in rewards_c))

        total_a = sum(returns_a, 0.0)
        total_c = sum(returns_c, 0.0)

        assert total_a < total_c, (
            f"always_route_a ({total_a:.2f}) >= always_route_c ({total_c:.2f}) on HARD"
        )
225
+
226
+
227
+ # ─── Degenerate Policy Sanity ──────────────────────────────────────────
228
+
229
+
230
class TestDegeneratePolicySanity:
    """Degenerate policy tests."""

    _DEV_SEEDS = tuple(range(10))

    @staticmethod
    def _episode_return(env, policy, seed, scenario):
        """Run one episode and return its summed reward (``None`` rewards count as 0)."""
        rewards = run_full_episode(env, policy, seed=seed, scenario=scenario)[1]
        return sum(r or 0 for r in rewards)

    def _paired_means(self, rival_policy, scenario):
        """Return (baseline_mean, rival_mean) over the dev seeds.

        For each seed the heuristic baseline runs first, then the rival, on
        one shared environment instance.
        """
        env = BudgetRouterEnv()
        baseline_returns = []
        rival_returns = []
        for seed in self._DEV_SEEDS:
            baseline_returns.append(
                self._episode_return(env, heuristic_baseline_policy, seed, scenario)
            )
            rival_returns.append(self._episode_return(env, rival_policy, seed, scenario))
        baseline_mean = sum(baseline_returns) / len(baseline_returns)
        rival_mean = sum(rival_returns) / len(rival_returns)
        return baseline_mean, rival_mean

    def test_always_route_a_does_not_dominate_baseline_medium(self):
        """always_route_a does not dominate heuristic baseline on medium across dev seeds."""
        baseline_mean, always_a_mean = self._paired_means(always_route_a_policy, MEDIUM)

        assert baseline_mean >= always_a_mean, (
            f"always_route_a ({always_a_mean:.2f}) dominates baseline ({baseline_mean:.2f}) on medium"
        )

    def test_always_route_c_does_not_dominate_baseline_overall(self):
        """always_route_c does not dominate heuristic baseline across all tasks."""
        env = BudgetRouterEnv()
        baseline_total = 0.0
        always_c_total = 0.0

        # Totals (not means) are accumulated across EASY, MEDIUM and HARD.
        for scenario in (EASY, MEDIUM, HARD):
            for seed in self._DEV_SEEDS:
                baseline_total += self._episode_return(
                    env, heuristic_baseline_policy, seed, scenario
                )
                always_c_total += self._episode_return(
                    env, always_route_c_policy, seed, scenario
                )

        assert baseline_total >= always_c_total, (
            f"always_route_c ({always_c_total:.2f}) dominates baseline ({baseline_total:.2f}) overall"
        )

    def test_always_route_b_does_not_dominate_baseline_medium(self):
        """always_route_b does not dominate heuristic baseline on medium across dev seeds."""
        baseline_mean, always_b_mean = self._paired_means(always_route_b_policy, MEDIUM)

        assert baseline_mean >= always_b_mean, (
            f"always_route_b ({always_b_mean:.2f}) dominates baseline ({baseline_mean:.2f}) on medium"
        )

    def test_always_shed_load_worse_than_baseline_easy(self):
        """always_shed_load performs strictly worse than heuristic baseline on easy."""
        baseline_mean, shed_mean = self._paired_means(always_shed_load_policy, EASY)

        assert baseline_mean > shed_mean, (
            f"always_shed ({shed_mean:.2f}) >= baseline ({baseline_mean:.2f}) on easy"
        )
334
+
335
+
336
class TestBehavioralGuards:
    """Behavioral regression tests for the repo's core adaptation claims."""

    _DEV_SEEDS = tuple(range(10))

    def test_heuristic_outperforms_always_route_a_on_hard_dev_seeds(self):
        """On HARD, reactive routing must beat the cheapest non-adaptive baseline."""
        env = BudgetRouterEnv()
        heuristic_returns = []
        always_a_returns = []

        for seed in self._DEV_SEEDS:
            rewards = run_full_episode(
                env, heuristic_baseline_policy, seed=seed, scenario=HARD
            )[1]
            heuristic_returns.append(sum(r or 0 for r in rewards))

            rewards = run_full_episode(
                env, always_route_a_policy, seed=seed, scenario=HARD
            )[1]
            always_a_returns.append(sum(r or 0 for r in rewards))

        heuristic_mean = sum(heuristic_returns) / len(heuristic_returns)
        always_a_mean = sum(always_a_returns) / len(always_a_returns)

        assert heuristic_mean > always_a_mean, (
            f"heuristic ({heuristic_mean:.2f}) must beat always_route_a "
            f"({always_a_mean:.2f}) on hard dev seeds"
        )

    def test_heuristic_completes_hard_multi_without_budget_exhaustion(self):
        """On HARD_MULTI dev seeds, the baseline should finish without budget bankruptcy."""
        env = BudgetRouterEnv()

        for seed in self._DEV_SEEDS:
            outcome = run_full_episode(
                env, heuristic_baseline_policy, seed=seed, scenario=HARD_MULTI
            )
            done, steps = outcome[2], outcome[3]
            # Exhaustion is read from the environment's internal step history.
            exhausted = any(
                record.get("budget_exhausted", False)
                for record in env._internal.history
            )

            assert done, f"heuristic did not terminate on hard_multi seed={seed}"
            assert steps == HARD_MULTI.max_steps, (
                f"heuristic ended after {steps} steps, expected {HARD_MULTI.max_steps} "
                f"on hard_multi seed={seed}"
            )
            assert not exhausted, (
                f"heuristic hit budget exhaustion on hard_multi seed={seed}"
            )
387
+
388
+
389
+ # ─── Grader Semantic Tests ──────────────────────────────────────────────
390
+
391
+
392
class TestGraderSemantics:
    """Pin the exact grader semantics changed by the abstention and hard_multi fixes.

    These tests defend against regressions to grade_episode() — the most
    judge-sensitive function in the repo.
    """

    def _make_step(self, step, action, succeeded, cost, latency, degrade=999, secondary=None):
        """Build one synthetic history record in the shape grade_episode() reads."""
        record = {
            "step": step,
            "action_type": action,
            "request_succeeded": succeeded,
            "cost": cost,
            "latency_ms": latency,
            "reward": 0.9,
            "sla_ceiling_ms": 500.0,
            "initial_budget": 1.0,
            "degradation_start_step": degrade,
            "secondary_degradation_start_step": secondary,
        }
        return record

    def test_pure_abstention_scores_below_0_40_on_easy(self):
        """A policy that sheds all load must score < 0.40 overall on easy.

        Before the fix this scored ~0.70 (sla=1.0, latency=1.0 on empty routing set).
        """
        from budget_router.reward import grade_episode

        all_shed = []
        for i in range(1, 21):
            all_shed.append(self._make_step(i, "shed_load", False, 0.0, 0.0, degrade=999))
        result = grade_episode(all_shed)

        assert result["overall_score"] < 0.40, (
            f"Pure abstention scored {result['overall_score']} >= 0.40 on easy "
            f"(grader exploit not fixed)"
        )
        assert result["sla_score"] == 0.0, "sla_score should be 0.0 when no requests routed"
        assert result["latency_score"] == 0.0, "latency_score should be 0.0 when no requests routed"
        assert result["success_score"] == 0.0, "success_score should be 0.0 when no requests routed"
        assert result["budget_score"] == 1.0, "budget_score should be 1.0 when nothing spent"
        assert result["adaptation_score"] == 0.0, (
            "adaptation_score should be 0.0 on easy when the policy only sheds load"
        )

    def test_partial_abstention_scores_less_than_full_service(self):
        """A policy that sheds 50% of load must score < a policy that serves all 20 steps.

        Before the success_score denominator fix, partial abstention could outscore
        full service because budget_score rewarded not spending.
        """
        from budget_router.reward import grade_episode

        def served(i):
            return self._make_step(i, "route_to_a", True, 0.01, 110.0)

        # Mixed: steps 1-10 shed, steps 11-20 routed successfully.
        sheds = [self._make_step(i, "shed_load", False, 0.0, 0.0) for i in range(1, 11)]
        mixed = sheds + [served(i) for i in range(11, 21)]
        # Full service: all 20 steps routed successfully.
        full = [served(i) for i in range(1, 21)]

        r_mixed = grade_episode(mixed)
        r_full = grade_episode(full)

        assert r_mixed["overall_score"] < r_full["overall_score"], (
            f"Partial abstention ({r_mixed['overall_score']}) >= full service "
            f"({r_full['overall_score']}) — grader still rewards low-throughput"
        )
        assert r_mixed["success_score"] < r_full["success_score"], (
            f"success_score should be lower for 10/20 served ({r_mixed['success_score']}) "
            f"than 20/20 served ({r_full['success_score']})"
        )

    def test_hard_multi_adaptation_uses_secondary_window(self):
        """grade_episode computes blended adaptation for hard_multi (secondary window included).

        Verifies that secondary_degradation_start_step=10 in step_info causes
        grade_episode to split the adaptation window at step 10 and blend 0.5/0.5.
        """
        from budget_router.reward import grade_episode

        def build_history(secondary):
            # Steps 1-10: route A and succeed; steps 11-20: route A and fail
            # with latency above the 500 ms ceiling.
            records = []
            for i in range(1, 11):
                records.append(
                    self._make_step(i, "route_to_a", True, 0.01, 110.0, degrade=0, secondary=secondary)
                )
            for i in range(11, 21):
                records.append(
                    self._make_step(i, "route_to_a", False, 0.01, 700.0, degrade=0, secondary=secondary)
                )
            return records

        result = grade_episode(build_history(secondary=10))

        # primary_window: steps > max(0,1)=1 and <= 10 → steps 2..10 → 9 steps, all succeed → 1.0
        # secondary_window: steps > 10 → steps 11..20 → 10 steps, all fail → 0.0
        # blended = 0.5 * 1.0 + 0.5 * 0.0 = 0.5
        expected_adaptation = 0.5
        assert abs(result["adaptation_score"] - expected_adaptation) < 0.01, (
            f"hard_multi blended adaptation expected ~{expected_adaptation}, "
            f"got {result['adaptation_score']}"
        )

        # Same episode without a secondary event, to confirm the two paths diverge.
        result_hard = grade_episode(build_history(secondary=None))
        # hard (no secondary): post_degrade = steps > max(0,1)=1 → steps 2..20 → 19 steps
        # 9 succeed (steps 2-10), 10 fail (steps 11-20) → 9/19 ≈ 0.473
        assert result["adaptation_score"] != result_hard["adaptation_score"], (
            f"hard_multi and hard got identical adaptation_score={result['adaptation_score']} "
            f"— secondary window not being used"
        )
budget_router/tests/test_eval_all_seed_selection.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.util
2
+ from pathlib import Path
3
+
4
+ import pytest
5
+
6
+
7
def _load_eval_all():
    """Load ``eval/eval_all.py`` by file path and return the executed module.

    The script is located relative to this test file (two directories up,
    then ``eval/eval_all.py``) and imported under the name ``eval_all``.

    Raises:
        AssertionError: if no import spec or loader can be created for the path.
    """
    path = Path(__file__).resolve().parents[2] / "eval" / "eval_all.py"
    spec = importlib.util.spec_from_file_location("eval_all", path)
    # Validate before module_from_spec(): a None spec would otherwise surface
    # as an opaque AttributeError instead of a readable failure.
    assert spec is not None and spec.loader is not None, f"cannot create import spec for {path}"
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
14
+
15
+
16
def test_seed_values_override_named_seed_set():
    """Explicit ``seed_values`` are returned even when a seed set and count are given."""
    select_seeds = _load_eval_all().select_seeds

    chosen = select_seeds(seed_set="dev", seeds=3, seed_values="200,201,202")

    assert chosen == [200, 201, 202]
24
+
25
+
26
def test_seed_values_accept_commas_and_whitespace():
    """Seed strings may mix comma and whitespace separators."""
    select_seeds = _load_eval_all().select_seeds

    chosen = select_seeds(seed_set="heldout", seeds=1, seed_values="200, 201 202")

    assert chosen == [200, 201, 202]
34
+
35
+
36
def test_seed_values_reject_empty_input():
    """A seed string containing only separators raises ``ValueError``."""
    select_seeds = _load_eval_all().select_seeds

    with pytest.raises(ValueError, match="No explicit seeds"):
        select_seeds(seed_set="dev", seeds=3, seed_values=" , ")
41
+
budget_router/tests/test_grpo_training_reward.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ # GRPO tests import train/learn_experiment.py, which loads torch, datasets, peft,
4
+ # transformers, trl at module import. Those live under `--extra grpo` (torch alone
5
+ # may exist via `--extra training`, which is not enough).
6
+ for _grpo_mod in ("torch", "datasets", "peft", "transformers", "trl"):
7
+ pytest.importorskip(_grpo_mod)
8
+
9
+ from budget_router.reward import grade_episode
10
+ from train.grpo_env import BudgetRouterGRPOEnv
11
+ from train.learn_experiment import build_dataset, build_system_prompt, reward_func, summarize_training_rollout
12
+
13
+
14
+ def _step_once(env: BudgetRouterGRPOEnv) -> None:
15
+ # Any routing action is fine; we just need non-empty history.
16
+ # Use B as a reasonably stable default.
17
+ try:
18
+ env.route_to_b()
19
+ except ValueError as e:
20
+ # If an episode somehow terminates early, that's fine for the test harness,
21
+ # but it would make the "partial episode" test invalid.
22
+ raise AssertionError(f"Episode ended unexpectedly after one step: {e}") from e
23
+
24
+
25
+ def _run_to_completion(env: BudgetRouterGRPOEnv) -> None:
26
+ # Drive the episode until the GRPO wrapper signals completion.
27
+ while True:
28
+ try:
29
+ env.route_to_b()
30
+ except ValueError:
31
+ return
32
+
33
+
34
def test_reward_func_empty_history_returns_zero():
    """A freshly reset environment with no steps taken yields a reward of 0.0."""
    fresh_env = BudgetRouterGRPOEnv()
    fresh_env.reset(scenario="hard_multi", seed=0)

    assert reward_func([fresh_env]) == [0.0]
40
+
41
+
42
def test_reward_func_partial_episode_is_progress_scaled_not_full_grader():
    """An incomplete episode is rewarded as grader score times progress fraction.

    Regression guard: the training reward must differ from the raw grader
    score while the episode is still in progress.
    """
    env = BudgetRouterGRPOEnv()
    env.reset(scenario="hard_multi", seed=0)
    _step_once(env)

    state = env._env._internal
    assert state.history, "test precondition: history must be non-empty"
    assert not state.episode_done, "test precondition: episode must be incomplete"

    raw_score = float(grade_episode(state.history)["overall_score"])
    fraction_done = state.current_step / max(1, state.max_steps)
    scaled_score = raw_score * fraction_done

    observed = reward_func([env])
    assert observed == [pytest.approx(scaled_score, abs=1e-6)]
    assert observed[0] != pytest.approx(raw_score, abs=1e-6)
61
+
62
+
63
def test_reward_func_complete_episode_equals_full_grader():
    """Once the episode is finished the training reward equals the grader score."""
    env = BudgetRouterGRPOEnv()
    env.reset(scenario="hard_multi", seed=0)
    _run_to_completion(env)

    state = env._env._internal
    assert state.history, "test precondition: history must be non-empty"
    assert state.episode_done, "test precondition: episode must be complete"

    raw_score = float(grade_episode(state.history)["overall_score"])
    observed = reward_func([env])
    assert observed == [pytest.approx(raw_score, abs=1e-6)]
76
+
77
+
78
def test_training_rollout_summary_exposes_partial_episode_health():
    """The rollout summary reports step counts, completion rate and progress.

    After two steps the expected progress is 0.1, and the raw grader mean must
    exceed the (progress-scaled) training reward mean.
    """
    env = BudgetRouterGRPOEnv()
    env.reset(scenario="hard_multi", seed=0)
    for _ in range(2):
        _step_once(env)

    report = summarize_training_rollout([env])

    assert report["env_steps_mean"] == pytest.approx(2.0)
    assert report["env_steps_min"] == 2
    assert report["env_steps_max"] == 2
    assert report["episode_completion_rate"] == 0.0
    assert report["progress_mean"] == pytest.approx(0.1)
    assert report["raw_grader_mean"] > report["training_reward_mean"]
93
+
94
+
95
def test_training_rollout_summary_exposes_action_sequence_diversity():
    """The summary lists per-env action sequences and counts distinct ones."""
    same_a, same_b, different = (BudgetRouterGRPOEnv() for _ in range(3))
    for env in (same_a, same_b, different):
        env.reset(scenario="hard_multi", seed=0)

    # Two envs follow an identical B,B sequence; the third follows A,A.
    for env in (same_a, same_b):
        env.route_to_b()
        env.route_to_b()
    different.route_to_a()
    different.route_to_a()

    report = summarize_training_rollout([same_a, same_b, different])

    b_twice = "route_to_b route_to_b"
    a_twice = "route_to_a route_to_a"
    assert report["action_sequences"] == [b_twice, b_twice, a_twice]
    assert report["unique_action_sequences"] == 2
    assert report["action_sequence_counts"] == {b_twice: 2, a_twice: 1}
121
+
122
+
123
def test_grpo_tool_feedback_is_compact_for_multi_turn_budget():
    """Tool feedback stays under 180 characters and names the key fields."""
    env = BudgetRouterGRPOEnv()
    env.reset(scenario="hard_multi", seed=0)

    tool_reply = env.route_to_b()

    assert len(tool_reply) < 180
    for field in ("steps_left=", "health="):
        assert field in tool_reply
132
+
133
+
134
def test_explore_prompt_preserves_tool_format_without_deterministic_policy():
    """The "explore" prompt keeps the tool-call format but omits policy hints."""
    prompt = build_system_prompt("explore")

    # Tool-call syntax and every action name must be present.
    required_fragments = (
        "<tool_call>",
        '"name": "route_to_a"',
        "route_to_a",
        "route_to_b",
        "route_to_c",
        "shed_load",
    )
    for fragment in required_fragments:
        assert fragment in prompt

    # Fragments that must not appear in the exploratory prompt.
    assert "0.52" not in prompt
    assert "cheapest healthy provider" not in prompt.lower()
    for fragment in ("Observation:", "route_to_a route_to_b route_to_c"):
        assert fragment not in prompt
147
+
148
+
149
def test_build_dataset_uses_requested_prompt_style():
    """The first message of a dataset row is the system prompt for the style."""
    rows = build_dataset(n=1, prompt_style="explore")
    first_message = rows[0]["prompt"][0]

    assert first_message["content"] == build_system_prompt("explore")
154
+
budget_router/tests/test_inference_prompt.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from inference import SYSTEM_PROMPT
2
+ from budget_router.models import Observation
3
+ from inference import LLMRouter
4
+
5
+
6
def test_system_prompt_has_required_structural_sections():
    """The system prompt contains each expected section heading (case-insensitive)."""
    shouted = SYSTEM_PROMPT.upper()

    assert "GOLDEN RULE" in shouted or "DEFAULT STRATEGY" in shouted
    for heading in ("BUDGET RUNWAY", "TASK PROFILE", "NOISE CALIBRATION"):
        assert heading in shouted
12
+
13
+
14
def test_system_prompt_communicates_bankruptcy_consequence():
    """The prompt mentions the bankruptcy penalty and the unobserved-status value."""
    lowered = SYSTEM_PROMPT.lower()

    mentions_penalty = "-10" in SYSTEM_PROMPT or "bankruptcy" in lowered
    assert mentions_penalty
    mentions_unobserved = "0.500" in SYSTEM_PROMPT or "unobserved" in lowered
    assert mentions_unobserved
17
+
18
+
19
+ class _FakeResponse:
20
+ def __init__(self, content: str) -> None:
21
+ self.choices = [type("Choice", (), {"message": type("Message", (), {"content": content})()})()]
22
+
23
+
24
+ class _FakeClient:
25
+ def with_options(self, **kwargs):
26
+ return self
27
+
28
+ @property
29
+ def chat(self):
30
+ return self
31
+
32
+ @property
33
+ def completions(self):
34
+ return self
35
+
36
+ def create(self, **kwargs):
37
+ return _FakeResponse("route_to_a")
38
+
39
+
40
def test_llm_router_preserves_task_name_on_first_step():
    """The task name set in ``reset`` survives the first ``choose_action`` call."""
    router = LLMRouter(api_base_url="https://example.com/v1", model_name="test-model", api_key="test-key")
    router._client = _FakeClient()
    router.reset(task_name="hard_multi")

    first_observation = Observation(
        provider_a_status=0.5,
        provider_b_status=0.5,
        provider_c_status=0.5,
        budget_remaining=1.0,
        queue_backlog=0.0,
        system_latency=0.2,
        step_count=0.0,
    )
    router.choose_action(first_observation)

    assert router._task_name == "hard_multi"
    # The test reads the prompt from the second-to-last recorded message.
    sent_prompt = router._messages[-2]["content"]
    assert "task: hard_multi" in sent_prompt
59
+
60
+
61
def test_objective_feedback_mode_includes_previous_step_feedback():
    """In ``objective_feedback`` mode the prompt echoes the previous step's outcome."""
    router = LLMRouter(
        api_base_url="https://example.com/v1",
        model_name="test-model",
        api_key="test-key",
        prompt_mode="objective_feedback",
    )
    router._client = _FakeClient()
    router.reset(task_name="hard_multi")

    # Observation describing a failed route_to_a on the preceding step.
    previous_step = {
        "action_type": "route_to_a",
        "request_succeeded": False,
        "cost": 0.01,
        "latency_ms": 620.0,
    }
    observation = Observation(
        provider_a_status=0.4,
        provider_b_status=0.7,
        provider_c_status=0.9,
        budget_remaining=0.8,
        queue_backlog=0.1,
        system_latency=0.4,
        step_count=0.5,
        reward=-2.05,
        metadata=previous_step,
    )
    router.choose_action(observation)

    sent_prompt = router._messages[-2]["content"]
    expected_fragments = (
        "previous_step_feedback:",
        "previous_action: route_to_a",
        "previous_reward: -2.05",
        "previous_success: false",
    )
    for fragment in expected_fragments:
        assert fragment in sent_prompt
budget_router/tests/test_trace_episode.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import importlib.util
2
+ from pathlib import Path
3
+
4
+
5
def _load_trace_episode():
    """Load ``eval/trace_episode.py`` by file path and return the executed module.

    The script is located relative to this test file (two directories up,
    then ``eval/trace_episode.py``) and imported under the name ``trace_episode``.

    Raises:
        AssertionError: if no import spec or loader can be created for the path.
    """
    path = Path(__file__).resolve().parents[2] / "eval" / "trace_episode.py"
    spec = importlib.util.spec_from_file_location("trace_episode", path)
    # Validate before module_from_spec(): a None spec would otherwise surface
    # as an opaque AttributeError instead of a readable failure.
    assert spec is not None and spec.loader is not None, f"cannot create import spec for {path}"
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
12
+
13
+
14
def test_trace_episode_returns_step_rows_and_scores_for_heuristic():
    """A heuristic trace echoes its inputs and carries steps, metrics and a grade."""
    module = _load_trace_episode()

    trace = module.trace_episode(task_name="hard_multi", seed=3, policy_name="heuristic")
    step_rows = trace["steps"]

    # Inputs are echoed back.
    assert trace["task"] == "hard_multi"
    assert trace["seed"] == 3
    assert trace["policy"] == "heuristic"

    # Step rows exist and agree with the summary fields.
    assert step_rows
    assert len(step_rows) == trace["episode_length"]
    summed_reward = round(sum(row["reward"] for row in step_rows), 4)
    assert trace["total_reward"] == summed_reward
    assert 0.0 <= trace["grader"]["overall_score"] <= 1.0

    expected_metric_keys = {"success_rate", "total_cost_spent", "average_latency_ms"}
    assert expected_metric_keys.issubset(trace["metrics"])
    expected_step_keys = {
        "provider_a_status",
        "provider_b_status",
        "provider_c_status",
        "observed_budget_remaining",
    }
    assert expected_step_keys.issubset(step_rows[0])
33
+
34
+
35
def test_trace_episode_rejects_unknown_policy():
    """An unrecognised policy name raises ``ValueError`` mentioning "Unknown policy"."""
    module = _load_trace_episode()

    caught = None
    try:
        module.trace_episode(task_name="hard_multi", seed=3, policy_name="unknown")
    except ValueError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("unknown policy should raise ValueError")
    assert "Unknown policy" in str(caught)
budget_router/tests/test_validation.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for the validation harness.
3
+
4
+ Covers: policy ordering, solvability, NaN safety, baseline stability,
5
+ and hard task crash resistance.
6
+ """
7
+
8
+ import math
9
+ import random
10
+
11
+ import pytest
12
+
13
+ from budget_router.environment import BudgetRouterEnv
14
+ from budget_router.models import Action, ActionType
15
+ from budget_router.policies import (
16
+ always_route_a_policy,
17
+ always_route_b_policy,
18
+ always_route_c_policy,
19
+ always_shed_load_policy,
20
+ debug_upper_bound_policy,
21
+ heuristic_baseline_policy,
22
+ random_policy,
23
+ )
24
+ from budget_router.tasks import EASY, HARD, MEDIUM
25
+ from budget_router.validation import DEVELOPMENT_SEEDS, HELDOUT_SEEDS, run_episode
26
+
27
+
28
+ # ─── Helpers ────────────────────────────────────────────────────────────
29
+
30
+
31
def mean_reward_over_seeds(policy_fn, scenario, seeds, policy_name=""):
    """Run one episode per seed and average the total rewards.

    A single ``BudgetRouterEnv`` is reused for every seed.

    Returns:
        A ``(mean_total_reward, per_seed_total_rewards)`` tuple.
    """
    env = BudgetRouterEnv()
    rewards = [
        run_episode(env, policy_fn, seed, scenario, policy_name=policy_name)["total_reward"]
        for seed in seeds
    ]
    return sum(rewards) / len(rewards), rewards
39
+
40
+
41
+ # ─── Validation Tests ──────────────────────────────────────────────────
42
+
43
+
44
class TestValidation:
    """Validation-level tests: policy ordering, solvability, NaN safety, stability."""

    def test_baseline_beats_random_easy_dev(self):
        """Baseline beats random on easy task across development seeds."""
        baseline_mean, _ = mean_reward_over_seeds(
            heuristic_baseline_policy, EASY, DEVELOPMENT_SEEDS
        )
        random_mean, _ = mean_reward_over_seeds(
            random_policy, EASY, DEVELOPMENT_SEEDS, policy_name="random"
        )
        assert baseline_mean > random_mean, (
            f"baseline ({baseline_mean:.2f}) <= random ({random_mean:.2f}) on easy"
        )

    def test_upper_bound_beats_baseline_easy_dev(self):
        """Upper bound beats or matches baseline on easy task across dev seeds."""
        baseline_mean, _ = mean_reward_over_seeds(
            heuristic_baseline_policy, EASY, DEVELOPMENT_SEEDS
        )
        ub_mean, _ = mean_reward_over_seeds(
            debug_upper_bound_policy, EASY, DEVELOPMENT_SEEDS, policy_name="upper_bound"
        )
        assert ub_mean >= baseline_mean, (
            f"oracle ({ub_mean:.2f}) < baseline ({baseline_mean:.2f}) on easy"
        )

    def test_easy_solvable_positive_reward(self):
        """Easy task is solvable: baseline achieves positive total reward on seed=42."""
        env = BudgetRouterEnv()
        metrics = run_episode(env, heuristic_baseline_policy, seed=42, scenario=EASY)
        assert metrics["total_reward"] > 0, (
            f"baseline achieves {metrics['total_reward']:.2f} on easy/seed=42"
        )

    def test_hard_no_crash_dev_seeds(self):
        """Hard task terminates without environment crash on development_seeds."""
        env = BudgetRouterEnv()
        for seed in DEVELOPMENT_SEEDS:
            # Only the episode run is guarded: a failed length assertion must be
            # reported as a length violation, not misreported as a crash.
            try:
                metrics = run_episode(
                    env, heuristic_baseline_policy, seed=seed, scenario=HARD
                )
            except Exception as e:
                pytest.fail(f"Hard task crashed on seed {seed}: {e}")
            assert metrics["episode_length"] <= 20, (
                f"Hard task ran {metrics['episode_length']} steps on seed {seed}"
            )

    def test_no_nan_rewards_all_combos(self):
        """No reward is NaN across all (task, policy, seed_set) combinations."""
        env = BudgetRouterEnv()
        policies = {
            "random": random_policy,
            "heuristic_baseline": heuristic_baseline_policy,
            "upper_bound": debug_upper_bound_policy,
            "always_route_a": always_route_a_policy,
            "always_route_b": always_route_b_policy,
            "always_route_c": always_route_c_policy,
            "always_shed_load": always_shed_load_policy,
        }

        for scenario in [EASY, MEDIUM, HARD]:
            for policy_name, policy_fn in policies.items():
                for seed in DEVELOPMENT_SEEDS[:3]:  # subset for speed
                    metrics = run_episode(
                        env, policy_fn, seed, scenario, policy_name=policy_name
                    )
                    assert not math.isnan(metrics["total_reward"]), (
                        f"NaN reward: {scenario.name}/{policy_name}/seed={seed}"
                    )

    def test_baseline_stability_heldout(self):
        """Baseline remains within reasonable stability margin on heldout seeds."""
        for scenario in [EASY, MEDIUM, HARD]:
            dev_mean, _ = mean_reward_over_seeds(
                heuristic_baseline_policy, scenario, DEVELOPMENT_SEEDS
            )
            heldout_mean, _ = mean_reward_over_seeds(
                heuristic_baseline_policy, scenario, HELDOUT_SEEDS
            )
            # Allowed drift: 40% of the dev mean, with an absolute floor of 2.0.
            margin = max(2.0, 0.40 * abs(dev_mean))
            assert abs(heldout_mean - dev_mean) <= margin, (
                f"Baseline unstable on {scenario.name}: "
                f"dev={dev_mean:.2f}, heldout={heldout_mean:.2f}, margin={margin:.2f}"
            )

    def test_baseline_beats_always_route_b_dev(self):
        """Baseline beats always_route_b on all tasks across development seeds."""
        for scenario in [EASY, MEDIUM, HARD]:
            baseline_mean, _ = mean_reward_over_seeds(
                heuristic_baseline_policy, scenario, DEVELOPMENT_SEEDS
            )
            always_b_mean, _ = mean_reward_over_seeds(
                always_route_b_policy, scenario, DEVELOPMENT_SEEDS
            )
            assert baseline_mean >= always_b_mean, (
                f"baseline ({baseline_mean:.2f}) < always_route_b ({always_b_mean:.2f}) on {scenario.name}"
            )
budget_router/validation.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Validation harness for the Budget Router environment.
3
+
4
+ - run_validation(): runs all policies across all tasks and seed sets
5
+ - run_manual_trace(): step-by-step debug trace
6
+ - assert_all_checks(): hard assertions that must pass before submission
7
+ - print_results_table(): formatted results display
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ import random
14
+ from typing import Any, Callable, Dict, List, Optional, Tuple
15
+
16
+ from .environment import BudgetRouterEnv
17
+ from .models import Action, ActionType, InternalState, Observation, TaskConfig
18
+ from .policies import (
19
+ always_route_a_policy,
20
+ always_route_b_policy,
21
+ always_route_c_policy,
22
+ always_shed_load_policy,
23
+ debug_upper_bound_policy,
24
+ heuristic_baseline_policy,
25
+ random_policy,
26
+ )
27
+ from .reward import episode_metrics
28
+ from .tasks import EASY, HARD, HARD_MULTI, MEDIUM, TASK_PRESETS
29
+
30
+ # ─── Seed sets ──────────────────────────────────────────────────────────
31
+
32
+ DEVELOPMENT_SEEDS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
33
+ HELDOUT_SEEDS = [100, 101, 102, 103, 104]
34
+
35
+
36
+ # ─── Episode runner ─────────────────────────────────────────────────────
37
+
38
+
39
def run_episode(
    env: BudgetRouterEnv,
    policy_fn: Callable,
    seed: int,
    scenario: TaskConfig,
    policy_name: str = "",
) -> Dict[str, Any]:
    """Play one episode with ``policy_fn`` and return its metrics.

    The calling convention for the policy is chosen from ``policy_name``:
    names containing ``"upper_bound"`` also receive the environment's internal
    state, names containing ``"random"`` receive a dedicated RNG seeded with
    ``seed + 10000``, and every other policy receives only the observation.

    Returns:
        The ``episode_metrics`` dict for the episode history, extended with
        ``"total_reward"`` (rounded to 4 decimals) and ``"episode_length"``.
    """
    obs = env.reset(seed=seed, scenario=scenario)

    wants_internal_state = "upper_bound" in policy_name
    wants_rng = "random" in policy_name
    # Separate RNG so the random policy does not share the environment's seed stream.
    policy_rng = random.Random(seed + 10000) if wants_rng else None

    def choose(observation):
        if wants_internal_state:
            return policy_fn(observation, env._internal)
        if wants_rng:
            return policy_fn(observation, rng=policy_rng)
        return policy_fn(observation)

    reward_sum = 0.0
    step_count = 0
    while not obs.done and step_count < scenario.max_steps:
        obs = env.step(choose(obs))
        reward_sum += (obs.reward or 0.0)
        step_count += 1

    metrics = episode_metrics(env._internal.history)
    metrics["total_reward"] = round(reward_sum, 4)
    metrics["episode_length"] = step_count
    return metrics
73
+
74
+
75
+ # ─── Validation runner ──────────────────────────────────────────────────
76
+
77
+
78
def run_validation(seed_set_name: str = "development") -> Dict[str, Dict[str, Dict[str, Any]]]:
    """
    Run all 7 policies on all 4 tasks for the given seed set.

    Args:
        seed_set_name: ``"development"`` selects DEVELOPMENT_SEEDS; any other
            value (e.g. ``"heldout"``) selects HELDOUT_SEEDS.

    Returns:
        Nested dict: results[task_name][policy_name] = {
            "mean_reward", "std_reward", "min_reward", "max_reward",
            "success_rate", "average_cost", "average_latency",
            "all_rewards", "all_lengths"
        }
        ``std_reward`` is the population standard deviation of the per-seed
        total rewards; the averaged fields are means over the seeds.
    """
    seeds = DEVELOPMENT_SEEDS if seed_set_name == "development" else HELDOUT_SEEDS

    policies = {
        "random": random_policy,
        "heuristic_baseline": heuristic_baseline_policy,
        "upper_bound": debug_upper_bound_policy,
        "always_route_a": always_route_a_policy,
        "always_route_b": always_route_b_policy,
        "always_route_c": always_route_c_policy,
        "always_shed_load": always_shed_load_policy,
    }

    tasks = {"easy": EASY, "medium": MEDIUM, "hard": HARD, "hard_multi": HARD_MULTI}
    results: Dict[str, Dict[str, Dict[str, Any]]] = {}

    # One environment instance is reused; run_episode resets it per episode.
    env = BudgetRouterEnv()

    for task_name, task_config in tasks.items():
        results[task_name] = {}
        for policy_name, policy_fn in policies.items():
            all_rewards = []
            all_success_rates = []
            all_costs = []
            all_latencies = []
            all_lengths = []

            for seed in seeds:
                metrics = run_episode(
                    env, policy_fn, seed, task_config, policy_name=policy_name
                )
                all_rewards.append(metrics["total_reward"])
                all_success_rates.append(metrics["success_rate"])
                all_costs.append(metrics["total_cost_spent"])
                all_latencies.append(metrics["average_latency_ms"])
                all_lengths.append(metrics["episode_length"])

            mean_r = sum(all_rewards) / len(all_rewards)
            # Population (not sample) standard deviation.
            std_r = (
                sum((r - mean_r) ** 2 for r in all_rewards) / len(all_rewards)
            ) ** 0.5

            results[task_name][policy_name] = {
                "mean_reward": round(mean_r, 4),
                "std_reward": round(std_r, 4),
                "min_reward": round(min(all_rewards), 4),
                "max_reward": round(max(all_rewards), 4),
                "success_rate": round(
                    sum(all_success_rates) / len(all_success_rates), 4
                ),
                "average_cost": round(sum(all_costs) / len(all_costs), 4),
                "average_latency": round(
                    sum(all_latencies) / len(all_latencies), 2
                ),
                "all_rewards": all_rewards,
                "all_lengths": all_lengths,
            }

    return results
147
+
148
+
149
+ # ─── Results printer ────────────────────────────────────────────────────
150
+
151
+
152
def print_results_table(results: Dict, seed_set_name: str = "development") -> None:
    """Print one formatted table per task from ``run_validation``-style results.

    Each row shows a policy's mean/std/min/max reward, success rate, average
    cost and average latency.
    """
    banner = "=" * 90
    column_titles = ("Mean", "Std", "Min", "Max", "SucRate", "Cost", "Lat(ms)")
    # (stats key, format spec) for each numeric column, in display order.
    numeric_columns = (
        ("mean_reward", ">8.2f"),
        ("std_reward", ">8.2f"),
        ("min_reward", ">8.2f"),
        ("max_reward", ">8.2f"),
        ("success_rate", ">8.2f"),
        ("average_cost", ">8.4f"),
        ("average_latency", ">8.1f"),
    )

    print(f"\n{banner}")
    print(f" VALIDATION RESULTS — {seed_set_name.upper()} SEEDS")
    print(banner)

    for task_name, policies in results.items():
        print(f"\n Task: {task_name.upper()}")
        header_cells = " ".join(f"{title:>8}" for title in column_titles)
        print(f" {'Policy':<20} {header_cells}")
        print(" " + "-" * 76)
        for policy_name, stats in policies.items():
            cells = " ".join(format(stats[key], spec) for key, spec in numeric_columns)
            print(f" {policy_name:<20} {cells}")

    print(f"\n{banner}")
175
+
176
+
177
+ # ─── Manual Trace ──────────────────────────────────────────────────────
178
+
179
+
180
def run_manual_trace(
    seed: int = 42,
    scenario_name: str = "medium",
    policy_fn: Optional[Callable] = None,
    policy_name: str = "heuristic_baseline",
) -> None:
    """
    Print a step-by-step trace of one episode in raw internal units.
    PRIMARY debugging tool.

    Args:
        seed: environment seed; the random policy's RNG uses ``seed + 10000``.
        scenario_name: key into TASK_PRESETS selecting the scenario.
        policy_fn: policy to trace; defaults to the heuristic baseline.
        policy_name: label shown in the header; also selects the policy calling
            convention ("upper_bound" / "random" substrings).
    """
    scenario = TASK_PRESETS[scenario_name]
    policy = policy_fn or heuristic_baseline_policy
    env = BudgetRouterEnv()

    obs = env.reset(seed=seed, scenario=scenario)
    policy_rng = random.Random(seed + 10000)

    outer_rule = "─" * 95
    inner_rule = "─" * 91
    wants_internal_state = "upper_bound" in policy_name
    wants_rng = "random" in policy_name

    print(f"\n{outer_rule}")
    print(f" MANUAL TRACE — Scenario: {scenario_name.upper()}, Seed: {seed}, Policy: {policy_name}")
    print(outer_rule)
    print(
        f" {'Step':>4} | {'Action':<10} | {'A_health':>8} | {'B_health':>8} | {'C_health':>8} | "
        f"{'Latency':>8} | {'Budget$':>8} | {'Reward':>7} | {'Cumul':>7}"
    )
    print(f" {inner_rule}")

    cumulative = 0.0
    steps = 0

    while not obs.done and steps < scenario.max_steps:
        if wants_internal_state:
            action = policy(obs, env._internal)
        elif wants_rng:
            action = policy(obs, rng=policy_rng)
        else:
            action = policy(obs)

        obs = env.step(action)
        steps += 1

        reward = obs.reward or 0.0
        cumulative += reward

        # Values come straight from the internal state, not the observation.
        state = env._internal
        health = {name: state.providers[name].current_health for name in ("A", "B", "C")}

        print(
            f" {steps:>4} | {action.action_type.value:<10} | "
            f"{health['A']:>8.3f} | {health['B']:>8.3f} | {health['C']:>8.3f} | "
            f"{state.last_latency_ms:>6.0f}ms | ${state.budget_dollars:>7.2f} | "
            f"{reward:>+7.2f} | {cumulative:>+7.2f}"
        )

    print(f" {inner_rule}")

    metrics = episode_metrics(env._internal.history)
    print(
        f" EPISODE END | "
        f"success_rate={metrics['success_rate']:.2f} | "
        f"total_cost=${metrics['total_cost_spent']:.4f} | "
        f"sla_met={metrics['sla_met']} | "
        f"total_reward={cumulative:.2f}"
    )
    print(f"{outer_rule}\n")
249
+
250
+
251
+ # ─── Hard Assertions ───────────────────────────────────────────────────
252
+
253
+
254
def assert_all_checks(
    dev_results: Dict[str, Dict[str, Dict[str, Any]]],
    heldout_results: Dict[str, Dict[str, Dict[str, Any]]],
    max_episode_steps: int = 20,
) -> bool:
    """
    Run all hard checks and print a PASS/FAIL line for each.
    All must pass before submission.
    If any fails, fix the environment — do not weaken the assertion.

    Args:
        dev_results: ``run_validation`` output for the development seeds.
        heldout_results: ``run_validation`` output for the held-out seeds.
        max_episode_steps: upper bound on episode length used by the safety check.

    Returns:
        True when every check passed, False otherwise. Failures are reported
        by printing; no exception is raised.
    """
    print("\n" + "=" * 60)
    print(" RUNNING HARD ASSERTION CHECKS")
    print("=" * 60)

    passed = 0
    failed = 0
    total = 0

    def check(condition: bool, msg: str) -> None:
        """Record one check result and print its PASS/FAIL line."""
        nonlocal passed, failed, total
        total += 1
        if condition:
            passed += 1
            print(f" ✅ PASS: {msg}")
        else:
            failed += 1
            print(f" ❌ FAIL: {msg}")

    # ── Policy ordering (BOTH seed sets) ──
    # easy/medium/hard: baseline > random and oracle >= baseline.
    # hard_multi: only oracle >= baseline is checked here, because random can
    # occasionally beat the deterministic heuristic on that task.
    for seed_set_name, results in [("dev", dev_results), ("heldout", heldout_results)]:
        for task in ["easy", "medium", "hard"]:
            baseline_mean = results[task]["heuristic_baseline"]["mean_reward"]
            random_mean = results[task]["random"]["mean_reward"]
            upper_bound_mean = results[task]["upper_bound"]["mean_reward"]

            check(
                baseline_mean > random_mean,
                f"[{seed_set_name}/{task}] baseline ({baseline_mean:.2f}) > random ({random_mean:.2f})",
            )
            check(
                upper_bound_mean >= baseline_mean,
                f"[{seed_set_name}/{task}] oracle ({upper_bound_mean:.2f}) >= baseline ({baseline_mean:.2f})",
            )
        # hard_multi: only check oracle >= baseline (heuristic fails by design)
        hm_baseline = results["hard_multi"]["heuristic_baseline"]["mean_reward"]
        hm_oracle = results["hard_multi"]["upper_bound"]["mean_reward"]
        check(
            hm_oracle >= hm_baseline,
            f"[{seed_set_name}/hard_multi] oracle ({hm_oracle:.2f}) >= baseline ({hm_baseline:.2f})",
        )

    # ── Non-triviality ──
    found_nontrivial = False
    for task in ["easy", "medium", "hard", "hard_multi"]:
        baseline_mean = dev_results[task]["heuristic_baseline"]["mean_reward"]
        random_mean = dev_results[task]["random"]["mean_reward"]
        # Relative gap when random's mean is non-zero, absolute gap otherwise.
        if abs(random_mean) > 0:
            gap = (baseline_mean - random_mean) / abs(random_mean)
        else:
            gap = abs(baseline_mean - random_mean)
        if gap > 0.20:
            found_nontrivial = True
            break
    check(found_nontrivial, "At least one task has >20% gap between baseline and random")

    # ── Solvability ──
    easy_ub_reward = dev_results["easy"]["upper_bound"]["mean_reward"]
    easy_ub_sr = dev_results["easy"]["upper_bound"]["success_rate"]
    check(easy_ub_reward > 0, f"Oracle positive reward on easy ({easy_ub_reward:.2f})")
    check(easy_ub_sr > 0.5, f"Oracle success rate on easy ({easy_ub_sr:.2f}) > 0.5")

    # ── Anti-gaming checks (hard_multi excluded — heuristic fails by design) ──
    for task in ["easy", "medium", "hard"]:
        baseline_mean = dev_results[task]["heuristic_baseline"]["mean_reward"]
        always_a_mean = dev_results[task]["always_route_a"]["mean_reward"]
        always_b_mean = dev_results[task]["always_route_b"]["mean_reward"]
        always_shed_mean = dev_results[task]["always_shed_load"]["mean_reward"]

        check(
            baseline_mean >= always_a_mean,
            f"[dev/{task}] baseline ({baseline_mean:.2f}) >= always_a ({always_a_mean:.2f})",
        )
        check(
            baseline_mean >= always_b_mean,
            f"[dev/{task}] baseline ({baseline_mean:.2f}) >= always_b ({always_b_mean:.2f})",
        )
        check(
            baseline_mean >= always_shed_mean,
            f"[dev/{task}] baseline ({baseline_mean:.2f}) >= always_shed ({always_shed_mean:.2f})",
        )

    # Check that NOT all degenerate policies dominate baseline
    for task in ["easy", "medium", "hard", "hard_multi"]:
        baseline_mean = dev_results[task]["heuristic_baseline"]["mean_reward"]
        always_a = dev_results[task]["always_route_a"]["mean_reward"]
        always_b = dev_results[task]["always_route_b"]["mean_reward"]
        always_c = dev_results[task]["always_route_c"]["mean_reward"]
        always_shed = dev_results[task]["always_shed_load"]["mean_reward"]
        check(
            not (
                always_a >= baseline_mean
                and always_b >= baseline_mean
                and always_c >= baseline_mean
                and always_shed >= baseline_mean
            ),
            f"[dev/{task}] heuristic provides strategic advantage over degenerate policies",
        )

    # ── Held-out robustness ──
    for task in ["easy", "medium", "hard", "hard_multi"]:
        baseline_dev = dev_results[task]["heuristic_baseline"]["mean_reward"]
        baseline_heldout = heldout_results[task]["heuristic_baseline"]["mean_reward"]
        # Allowed drift: 40% of the dev mean, with an absolute floor of 2.0.
        margin = max(2.0, 0.40 * abs(baseline_dev))
        check(
            abs(baseline_heldout - baseline_dev) <= margin,
            f"[{task}] baseline stable: dev={baseline_dev:.2f}, heldout={baseline_heldout:.2f}, margin={margin:.2f}",
        )

    # ── Safety: NaN, budget explosion, infinite loops ──
    # Every task present in the results is included (hard_multi too), so an
    # unsafe episode cannot hide in a task the check does not look at.
    all_rewards = []
    all_lengths = []
    for results in (dev_results, heldout_results):
        for per_policy in results.values():
            for stats in per_policy.values():
                all_rewards.extend(stats["all_rewards"])
                all_lengths.extend(stats["all_lengths"])

    check(
        all(not math.isnan(r) for r in all_rewards),
        f"No NaN rewards across {len(all_rewards)} episodes",
    )
    check(
        all(ep_len <= max_episode_steps for ep_len in all_lengths),
        f"No episode exceeds {max_episode_steps} steps (max seen: {max(all_lengths) if all_lengths else 0})",
    )

    # ── Summary ──
    print(f"\n{'='*60}")
    print(f" RESULTS: {passed}/{total} passed, {failed}/{total} failed")
    print(f"{'='*60}")

    if failed > 0:
        print(f"\n ⚠️ {failed} assertion(s) FAILED. Fix the environment before submission.")
    else:
        print(f"\n 🎉 All assertions passed! Environment is ready for submission.")

    return failed == 0
399
+
400
+
401
+ # ─── Main entry point ──────────────────────────────────────────────────
402
+
403
+
404
def main() -> None:
    """Run the full validation suite: both seed sets, two traces, then the checks."""
    banners = (
        ("development", "Running validation on DEVELOPMENT seeds..."),
        ("heldout", "\nRunning validation on HELD-OUT seeds..."),
    )
    results_by_set = {}
    for seed_set_name, banner in banners:
        print(banner)
        results_by_set[seed_set_name] = run_validation(seed_set_name)
        print_results_table(results_by_set[seed_set_name], seed_set_name)

    # Manual traces
    for scenario_name in ("medium", "hard_multi"):
        run_manual_trace(seed=42, scenario_name=scenario_name)

    # Hard assertions
    assert_all_checks(results_by_set["development"], results_by_set["heldout"])
421
+
422
+
423
+ if __name__ == "__main__":
424
+ main()
check_leak.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ check_leak.py — Validates BudgetRouterGRPOEnv before GRPO training.
3
+
4
+ Checks:
5
+ 1. Tool methods return strings (not crash).
6
+ 2. Episode ends gracefully via ValueError (TRL-idiomatic done signal).
7
+ 3. Reward is a float in [0, 1] — not a dict, not NaN.
8
+ 4. History uses actual_degradation_start (jittered) — NOT the config constant.
9
+ This proves grade_episode() will compute correct adaptation windows.
10
+ 5. 10-episode reward trajectory printed: verify no explosion/vanishing.
11
+ 6. Provider status IS present in tool responses (intentional — text interface needs it).
12
+
13
+ Run:
14
+ uv run python check_leak.py
15
+ """
16
+
17
+ import sys
18
+
19
+
20
def main() -> None:
    """Run the pre-training validation checks for BudgetRouterGRPOEnv.

    Exits with status 1 when the project modules cannot be imported.
    Otherwise runs checks 0-6 in order and raises ``AssertionError`` at the
    first failed check; the "ALL CHECKS PASSED" banner is printed only when
    every check (including the per-episode reward range in check 6) holds.
    """
    # Function-scope stdlib imports, matching the lazy-import style used
    # for the project modules below.
    import math
    import re
    import statistics

    try:
        from train.grpo_env import BudgetRouterGRPOEnv
        from budget_router.reward import grade_episode
        from budget_router.tasks import HARD_MULTI
    except ImportError as e:
        print(f"[FAIL] Import error: {e}")
        sys.exit(1)

    print("=" * 60)
    print("BudgetRouterGRPOEnv — Pre-training Validation")
    print("=" * 60)

    # ── Check 0: transformers version (soft warning — required for environment_factory) ──
    print("\n[CHECK 0] transformers version (required for environment_factory)...")
    try:
        import transformers
    except ImportError:
        print(
            " ⚠️ WARNING: transformers is NOT installed in this venv.\n"
            " Install before GRPO training: pip install 'transformers>=4.47.0' trl accelerate peft"
        )
    else:
        ver_str = transformers.__version__
        # TRL's environment_factory requires transformers >= 4.47.0 (confirmed shipping in
        # stable builds as of Apr 2026). Exact minimum threshold is version-specific to TRL.
        # If not installed, training will fail at import time — caught here early.
        print(f" ✅ transformers=={ver_str} installed.")
        # Soft check: warn if below 4.47 (minimum known to ship environment_factory support).
        # Only the leading "<major>.<minor>" digits are read, so suffixes such as
        # "rc1" or ".dev0" cannot make int() raise and abort the whole script.
        parsed = re.match(r"(\d+)\.(\d+)", ver_str)
        if parsed is None:
            print(
                f" ⚠️ WARNING: could not parse transformers version {ver_str!r}; "
                f"skipping minimum-version check."
            )
        elif (int(parsed.group(1)), int(parsed.group(2))) < (4, 47):
            print(
                f" ⚠️ WARNING: transformers {ver_str} may be too old for environment_factory.\n"
                f" Recommended: pip install 'transformers>=4.47.0' or install from main."
            )

    # ── Check 1: reset() returns a non-empty string ─────────────────────
    print("\n[CHECK 1] reset() returns rich text observation...")

    env = BudgetRouterGRPOEnv()
    obs_text = env.reset(scenario="hard_multi", seed=42)
    assert isinstance(obs_text, str) and len(obs_text) > 10, \
        f"reset() should return non-empty string, got: {obs_text!r}"
    assert "Budget" in obs_text, "reset() should mention Budget"
    assert "Provider" in obs_text, "reset() should include provider status (text interface, not sanitized)"
    print(f" ✅ reset() returned {len(obs_text)} chars. Provider status PRESENT (correct for text interface).")
    print(f" Preview: {obs_text[:120].replace(chr(10), ' ')}...")

    # ── Check 2: Tool methods return strings step-by-step ───────────────
    print("\n[CHECK 2] Tool methods return strings and accumulate history...")
    env2 = BudgetRouterGRPOEnv()
    env2.reset(scenario="hard_multi", seed=42)

    step_results = []
    episode_done = False
    for step in range(25):  # more than max_steps to test guard
        action_fn = [env2.route_to_a, env2.route_to_b, env2.shed_load, env2.route_to_b][step % 4]
        try:
            result = action_fn()
            assert isinstance(result, str), f"Tool method should return str, got {type(result)}"
            step_results.append(result)
            print(f" Step {step + 1:02d}: ✅ {result[:80].replace(chr(10), ' ')}...")
        except ValueError as e:
            # The env signals end-of-episode by raising ValueError from a tool call.
            episode_done = True
            print(f" Step {step + 1:02d}: ✅ Episode ended via ValueError (TRL-idiomatic): {str(e)[:80]}...")
            break

    assert episode_done, "Episode should end with ValueError before step 25"
    assert len(step_results) > 0, "At least one tool step should complete"
    print(f" ✅ Episode ended correctly after {len(step_results)} tool calls.")

    # ── Check 3: Reward is float in [0, 1] ──────────────────────────────
    print("\n[CHECK 3] Reward is float in [0, 1]...")
    assert isinstance(env2.reward, float), \
        f"env.reward should be float, got {type(env2.reward)}: {env2.reward!r}"
    assert 0.0 <= env2.reward <= 1.0, \
        f"env.reward should be in [0, 1], got {env2.reward}"
    assert not math.isnan(env2.reward), "env.reward is NaN — grade_episode bug"
    print(f" ✅ env.reward = {env2.reward:.4f} (float, in [0,1], not NaN)")

    # ── Check 4: History uses actual jittered degradation_start_step ────
    print("\n[CHECK 4] History contains jittered actual_degradation_start (not config constant)...")
    history = env2._env._internal.history
    assert len(history) > 0, "History should not be empty after episode"

    # Read degradation_start_step from step_info (written by environment.py)
    step_info_degrade_start = history[0].get("degradation_start_step")
    # Read the actual jittered value from internal state
    actual_jittered_start = env2._env._internal.actual_degradation_start
    # Config constant for hard_multi
    config_constant = HARD_MULTI.degradation_start_step  # = 0

    print(f" Config constant (degradation_start_step): {config_constant}")
    print(f" step_info[degradation_start_step]: {step_info_degrade_start}")
    print(f" internal.actual_degradation_start: {actual_jittered_start}")

    assert step_info_degrade_start is not None, \
        "step_info missing degradation_start_step — grade_episode() will break"
    assert step_info_degrade_start == actual_jittered_start, \
        (f"step_info uses wrong degradation onset! "
         f"Got {step_info_degrade_start}, expected {actual_jittered_start}. "
         f"This would corrupt adaptation scores in grade_episode().")
    print(f" ✅ Jittered onset correctly propagated through step_info.")

    # ── Check 5: grade_episode() on history returns consistent score ─────
    print("\n[CHECK 5] grade_episode(history) matches env.reward...")
    grader_result = grade_episode(history)
    assert isinstance(grader_result, dict), "grade_episode should return dict"
    grader_score = float(grader_result["overall_score"])
    assert abs(grader_score - env2.reward) < 1e-6, \
        f"env.reward ({env2.reward}) != grade_episode score ({grader_score}). Mismatch."
    print(f" ✅ grade_episode overall_score = {grader_score:.4f}, env.reward = {env2.reward:.4f}. Match confirmed.")

    # ── Check 6: 10-episode reward trajectory ────────────────────────────
    print("\n[CHECK 6] 10-episode reward trajectory (hard_multi, varying seeds)...")
    print(" Episode | Seed | Steps | Score | Reward-in-range")
    rewards = []
    bad_seeds = []  # seeds whose final reward fell outside [0, 1] (NaN included)
    for ep, seed in enumerate(range(10)):
        env3 = BudgetRouterGRPOEnv()
        env3.reset(scenario="hard_multi", seed=seed)
        done = False
        steps = 0
        while not done and steps < 30:
            # Alternate actions: A, B, A, B... (simple test policy)
            action_fn = env3.route_to_a if steps % 2 == 0 else env3.route_to_b
            try:
                action_fn()
                steps += 1
            except ValueError:
                done = True
        # An episode that never raised ValueError has no final reward to inspect.
        assert done, f"Episode with seed={seed} did not terminate within 30 tool calls"
        reward = env3.reward
        assert isinstance(reward, float), \
            f"env.reward should be float, got {type(reward)}: {reward!r} (seed={seed})"
        rewards.append(reward)
        # NaN fails this comparison too, so it is reported as out of range.
        reward_ok = 0.0 <= reward <= 1.0
        if not reward_ok:
            bad_seeds.append(seed)
        in_range = "✅" if reward_ok else "❌"
        print(f" Ep {ep+1:02d} | {seed:4d} | {steps:5d} | {reward:.4f} | {in_range}")

    # Fail only after the full table is printed so every offending seed is visible.
    assert not bad_seeds, f"Rewards outside [0, 1] for seeds: {bad_seeds}"

    if len(rewards) > 1:
        std = statistics.stdev(rewards)
        mean = statistics.mean(rewards)
        print(f"\n Mean reward: {mean:.4f} | Std: {std:.4f}")
        if std < 0.03:
            print(
                f" ⚠️ WARNING: Low reward variance (std={std:.4f}). GRPO may get weak gradient signal.\n"
                f" Mitigation: Use num_generations=8, hard_multi scenario, and a small LLM\n"
                f" at initialization that makes diverse routing decisions."
            )
        else:
            print(f" ✅ Reward variance is sufficient for GRPO learning (std={std:.4f} > 0.03).")

    print("\n" + "=" * 60)
    print("✅ ALL CHECKS PASSED — BudgetRouterGRPOEnv is ready for GRPO training.")
    print("=" * 60)
    print("\nRecommended training config (Mac MPS / Colab):")
    print(" scenario: hard_multi")
    print(" num_generations: 8")
    print(" model: Qwen2.5-1.5B (Mac 16GB) / Qwen2.5-7B (Colab T4)")
    print(" Mac: TRL + PyTorch MPS (set PYTORCH_ENABLE_MPS_FALLBACK=1)")
    print(" Colab: Unsloth + vLLM on NVIDIA T4/A100")
178
+
179
+
180
+ if __name__ == "__main__":
181
+ main()
client.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from budget_router.client import BudgetRouterClient
2
+
3
+ __all__ = ["BudgetRouterClient"]
eval/eval_all.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ eval_all.py — Budget Router Consolidated Evaluator
4
+ ====================================================
5
+ Runs heuristic + LLM (+ optional PPO) across all tasks and seeds.
6
+ Outputs a Markdown table + per-episode JSON to outputs/.
7
+
8
+ Usage:
9
+ # Quick (3 seeds, heuristic + LLM):
10
+ uv run python eval_all.py
11
+
12
+ # Full (10 seeds, all policies):
13
+ uv run python eval_all.py --seeds 10 --policies heuristic llm
14
+
15
+ # Heuristic only (no API needed):
16
+ uv run python eval_all.py --policies heuristic
17
+
18
+ # Specific tasks:
19
+ uv run python eval_all.py --tasks hard hard_multi --seeds 5
20
+
21
+ # Explicit fresh seed bucket:
22
+ uv run python eval_all.py --tasks hard_multi --seed-values "200,201,202"
23
+
24
+ Prerequisites:
25
+ export HF_TOKEN=<your_hf_token> # required for LLM policy
26
+ export API_BASE_URL=https://router.huggingface.co/v1 # default
27
+ export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct # default
28
+
29
+ Output:
30
+ outputs/eval_results_<timestamp>.json — full per-episode data
31
+ outputs/eval_summary_<timestamp>.md — markdown table for README
32
+ """
33
+
34
+ import json
35
+ import os
36
+ import sys
37
+ from datetime import datetime
38
+ from pathlib import Path
39
+ from typing import Dict, List, Optional
40
+
41
+ import typer
42
+
43
# ── Make project modules importable when run as a script ───────────────────
# NOTE(review): this file sits in eval/, while budget_router/ is at the repo
# root one level up — and presumably inference.py too; confirm. The script's
# own folder is still inserted first, exactly as before; the repo root is
# added right behind it.
_SCRIPT_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(_SCRIPT_DIR.parent))
sys.path.insert(0, str(_SCRIPT_DIR))
45
+
46
+ from budget_router.environment import BudgetRouterEnv
47
+ from budget_router.models import Action, ActionType, Observation, TaskConfig
48
+ from budget_router.policies import heuristic_baseline_policy
49
+ from budget_router.reward import episode_metrics, grade_episode
50
+ from budget_router.tasks import EASY, HARD, HARD_MULTI, MEDIUM
51
+
52
+ from inference import LLMRouter
53
+
54
+ # ── Config ──────────────────────────────────────────────────────────────────
55
+
56
# Task name (as accepted by --tasks) -> scenario configuration.
TASKS: Dict[str, TaskConfig] = dict(
    easy=EASY,
    medium=MEDIUM,
    hard=HARD,
    hard_multi=HARD_MULTI,
)

# Named seed buckets selectable via --seed-set.
SEED_SETS: Dict[str, List[int]] = {
    "dev": list(range(0, 10)),
    "heldout": list(range(100, 110)),
}

# Credentials / endpoint for the LLM policy; API_KEY wins over HF_TOKEN.
API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN")
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
# Raw-output logging toggle: any of the usual truthy spellings enables it.
LLM_LOG_RAW = (os.getenv("LLM_LOG_RAW") or "").strip().lower() in {"1", "true", "yes", "y", "on"}
# Unset or empty env var falls back to 220 characters.
LLM_LOG_RAW_MAX_CHARS = int(os.getenv("LLM_LOG_RAW_MAX_CHARS") or "220")
73
+
74
+
75
def select_seeds(seed_set: str, seeds: int, seed_values: Optional[str] = None) -> List[int]:
    """Pick the seeds for a run.

    When ``seed_values`` is given it takes precedence: the string is split on
    commas and/or whitespace and every token is converted with ``int``.
    Otherwise the first ``seeds`` entries of the named ``seed_set`` are
    returned, clamped to between one seed and the whole set.

    Raises:
        ValueError: if ``seed_values`` contains no tokens or a non-integer
            token, or if ``seed_set`` is not a key of ``SEED_SETS``.
    """
    if seed_values is None:
        if seed_set not in SEED_SETS:
            raise ValueError(f"Unknown seed set: {seed_set}. Choose from: {list(SEED_SETS)}")
        pool = SEED_SETS[seed_set]
        # Never more than the pool holds, never fewer than one.
        count = min(seeds, len(pool))
        if count < 1:
            count = 1
        return pool[:count]

    tokens = seed_values.replace(",", " ").split()
    explicit = list(map(int, tokens))
    if not explicit:
        raise ValueError("No explicit seeds provided in --seed-values")
    return explicit
88
+
89
+
90
+ def _single_line(value: str | None) -> str:
91
+ if not value:
92
+ return "null"
93
+ return str(value).replace("\n", " ").replace("\r", " ")
94
+
95
+
96
+ def _truncate(value: str | None, max_chars: int) -> str:
97
+ s = _single_line(value).strip()
98
+ if len(s) <= max_chars:
99
+ return s
100
+ return s[: max(0, max_chars - 3)] + "..."
101
+
102
+
103
+ # ── Policies ────────────────────────────────────────────────────────────────
104
def _llm_choose_action(policy: LLMRouter, obs: Observation) -> str:
    """Ask the LLM router for its next action and return the action's string value."""
    return policy.choose_action(obs).action_type.value
107
+
108
+
109
def _heuristic(obs: Observation) -> str:
    """Return the heuristic baseline's chosen action for *obs* as its string value."""
    chosen = heuristic_baseline_policy(obs)
    return chosen.action_type.value
111
+
112
+
113
+ # ── Episode runner ───────────────────────────────────────────────────────────
114
+
115
def run_one_episode(
    task_name: str,
    task_cfg: TaskConfig,
    seed: int,
    policy_name: str,
    policy,  # callable or LLMPolicy
) -> Dict:
    """Play one full episode and return its per-episode record.

    Args:
        task_name: Label stored in the record and passed to the LLM policy's reset.
        task_cfg: Scenario configuration handed to ``env.reset``.
        seed: Environment seed.
        policy_name: ``"heuristic"`` uses the baseline policy; any other
            value routes action selection through ``policy``.
        policy: Policy object; only used (and reset) for non-heuristic runs.

    Returns:
        Dict with task/seed/policy identifiers, the summed step reward,
        grader sub-scores (rounded to 4 places), success rate, step count,
        and the raw action and reward sequences.
    """
    is_llm = policy_name == "llm"
    env = BudgetRouterEnv()
    if is_llm:
        policy.reset(task_name=task_name)

    obs = env.reset(seed=seed, scenario=task_cfg)
    step_rewards = []
    chosen_actions = []

    while True:
        if obs.done:
            break

        action_str = _heuristic(obs) if policy_name == "heuristic" else _llm_choose_action(policy, obs)
        obs = env.step(Action(action_type=ActionType(action_str)))

        # A missing reward counts as zero.
        reward = float(obs.reward or 0.0)
        step_rewards.append(reward)
        chosen_actions.append(action_str)

        if not (is_llm and LLM_LOG_RAW):
            continue
        # Optional per-step trace of what the LLM emitted and how it was parsed.
        llm_raw = getattr(policy, "last_raw_output", None)
        llm_parsed = getattr(policy, "last_parsed_action", None)
        typer.echo(
            f"[LLM] step={env._internal.current_step} action={action_str} "
            f"reward={reward:+.2f} llm_raw={_truncate(llm_raw, max(20, LLM_LOG_RAW_MAX_CHARS))} "
            f"llm_parsed={_single_line(llm_parsed)}"
        )

    episode_history = env._internal.history
    grader = grade_episode(episode_history)
    metrics = episode_metrics(episode_history)

    # Key insertion order is kept identical to the JSON layout consumers see.
    record = {
        "task": task_name,
        "seed": seed,
        "policy": policy_name,
        "total_reward": round(sum(step_rewards), 4),
        "grader_score": round(grader["overall_score"], 4),
    }
    for field in ("success_score", "budget_score", "adaptation_score", "latency_score", "sla_score"):
        record[field] = round(grader[field], 4)
    record["success_rate"] = round(metrics["success_rate"], 4)
    record["steps"] = len(step_rewards)
    record["actions"] = chosen_actions
    record["rewards"] = step_rewards
    return record
168
+
169
+
170
+ # ── Summary helpers ──────────────────────────────────────────────────────────
171
+
172
+ def _mean(vals: List[float]) -> float:
173
+ return round(sum(vals) / len(vals), 4) if vals else 0.0
174
+
175
+
176
+ def build_summary(results: List[Dict]) -> Dict:
177
+ summary = {}
178
+ for r in results:
179
+ key = (r["task"], r["policy"])
180
+ summary.setdefault(key, []).append(r)
181
+ return {
182
+ f"{task}|{pol}": {
183
+ "grader_mean": _mean([e["grader_score"] for e in eps]),
184
+ "reward_mean": _mean([e["total_reward"] for e in eps]),
185
+ "success_rate": _mean([e["success_rate"] for e in eps]),
186
+ "adaptation": _mean([e["adaptation_score"] for e in eps]),
187
+ "n": len(eps),
188
+ }
189
+ for (task, pol), eps in summary.items()
190
+ }
191
+
192
+
193
def render_markdown_table(summary: Dict, policies: List[str], tasks: List[str]) -> str:
    """Render the grader means as a Markdown table, one row per task.

    Args:
        summary: Mapping ``"<task>|<policy>"`` -> stats dict containing at
            least ``grader_mean`` and ``n`` (as produced by ``build_summary``).
        policies: Policy names, in column order.
        tasks: Task names, in row order.

    Returns:
        The table as a single newline-joined string. Cells without data show
        an em dash. For the ``hard_multi`` row, when the second policy beats
        the first, the Notes column reports the gap in grader points, named
        after the policies actually compared.
    """
    task_labels = {"easy": "Easy", "medium": "Medium", "hard": "Hard", "hard_multi": "Hard_Multi"}
    pol_headers = " | ".join(f"{p.upper()} Grader" for p in policies)
    lines = [
        f"| Task | {pol_headers} | Notes |",
        # One separator cell per column: Task + each policy + Notes.
        "|" + "---|" * (len(policies) + 2),
    ]
    for task in tasks:
        cells = []
        for p in policies:
            stats = summary.get(f"{task}|{p}")
            cells.append(f"{stats['grader_mean']:.4f} (n={stats['n']})" if stats else "—")

        note = ""
        if task == "hard_multi" and len(policies) >= 2:
            baseline, challenger = policies[0], policies[1]
            base_key = f"{task}|{baseline}"
            chal_key = f"{task}|{challenger}"
            if base_key in summary and chal_key in summary:
                diff = summary[chal_key]["grader_mean"] - summary[base_key]["grader_mean"]
                if diff > 0:
                    # Label with the real policy names: a hard-coded
                    # "LLM ... vs heuristic" is wrong for any other ordering.
                    # For the default ["heuristic", "llm"] the text is unchanged.
                    note = f"{challenger.upper()} +{diff*100:.1f} points vs {baseline}"
        lines.append(f"| {task_labels.get(task, task)} | {' | '.join(cells)} | {note} |")
    return "\n".join(lines)
221
+
222
+
223
+ # ── CLI ──────────────────────────────────────────────────────────────────────
224
+
225
+ app = typer.Typer(add_completion=False)
226
+
227
+
228
@app.command()
def main(
    policies: List[str] = typer.Option(["heuristic", "llm"], help="Policies to run"),
    tasks: List[str] = typer.Option(["easy", "medium", "hard", "hard_multi"], help="Tasks"),
    seeds: int = typer.Option(3, help="Number of dev seeds (1-10, costs scale with LLM)"),
    seed_set: str = typer.Option("dev", help="Seed set: dev | heldout"),
    seed_values: Optional[str] = typer.Option(None, help="Explicit comma/space-separated seeds; overrides --seed-set/--seeds"),
    out_dir: Path = typer.Option(Path("outputs"), help="Output directory"),
) -> None:
    """Run Budget Router evaluation across policies, tasks, and seeds."""
    # NOTE: the docstring above doubles as the CLI help text, so details live
    # in comments. Flow: resolve seeds/tasks/policies, run every
    # (policy, task, seed) episode, then write a JSON dump and a Markdown
    # summary into out_dir. Exits with code 1 on bad seeds, no valid task,
    # or no successful episode.
    out_dir.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")

    try:
        selected_seeds = select_seeds(seed_set=seed_set, seeds=seeds, seed_values=seed_values)
    except ValueError as e:
        typer.echo(str(e), err=True)
        raise typer.Exit(1) from e

    # Surface typos instead of silently shrinking the evaluation.
    unknown_tasks = [t for t in tasks if t not in TASKS]
    if unknown_tasks:
        typer.echo(f"[WARN] Ignoring unknown task(s): {unknown_tasks}. Choose from: {list(TASKS)}", err=True)
    selected_tasks = {t: TASKS[t] for t in tasks if t in TASKS}

    if not selected_tasks:
        typer.echo(f"No valid tasks. Choose from: {list(TASKS)}", err=True)
        raise typer.Exit(1)

    # Build policy instances
    policy_instances = {}
    for p in policies:
        if p == "heuristic":
            policy_instances["heuristic"] = None  # uses _heuristic() directly
        elif p == "llm":
            try:
                if not API_KEY:
                    raise RuntimeError("No API key found. Set HF_TOKEN or API_KEY env var.")
                policy_instances["llm"] = LLMRouter(
                    api_base_url=API_BASE_URL, model_name=MODEL_NAME, api_key=API_KEY
                )
                typer.echo(f"LLM policy: {MODEL_NAME} via {API_BASE_URL}")
            except RuntimeError as e:
                # Best-effort: a missing key only disables the LLM column.
                typer.echo(f"[WARN] LLM policy unavailable: {e} — skipping", err=True)
        elif p == "ppo":
            typer.echo("[WARN] PPO eval not yet wired in this script — run your train_ppo.py separately", err=True)
        else:
            typer.echo(f"[WARN] Ignoring unknown policy: {p!r}. Choose from: heuristic, llm", err=True)

    all_results = []
    total_episodes = len(policy_instances) * len(selected_tasks) * len(selected_seeds)
    done = 0

    for pol_name, pol_obj in policy_instances.items():
        for task_name, task_cfg in selected_tasks.items():
            for seed in selected_seeds:
                typer.echo(f"[{done+1}/{total_episodes}] {pol_name:10s} | {task_name:12s} | seed={seed} ...", nl=False)
                try:
                    result = run_one_episode(task_name, task_cfg, seed, pol_name, pol_obj)
                    all_results.append(result)
                    typer.echo(f" grader={result['grader_score']:.4f} reward={result['total_reward']:+.2f}")
                except Exception as e:
                    # Deliberately broad: one failed episode (e.g. an API
                    # error) must not abort the remaining runs.
                    typer.echo(f" ERROR: {e}", err=True)
                done += 1

    if not all_results:
        typer.echo("No results produced.", err=True)
        raise typer.Exit(1)

    # Save JSON
    json_path = out_dir / f"eval_results_{ts}.json"
    summary = build_summary(all_results)
    output = {"metadata": {"timestamp": ts, "policies": policies, "tasks": tasks, "seeds": selected_seeds}, "summary": summary, "episodes": all_results}
    json_path.write_text(json.dumps(output, indent=2))
    typer.echo(f"\nResults saved to {json_path}")

    # Save markdown table (columns/rows reflect what actually ran)
    md_table = render_markdown_table(summary, list(policy_instances.keys()), list(selected_tasks.keys()))
    md_path = out_dir / f"eval_summary_{ts}.md"
    md_path.write_text(f"# Budget Router Evaluation — {ts}\n\n{md_table}\n")
    typer.echo(f"Markdown table saved to {md_path}")
    typer.echo(f"\n{md_table}")
303
+
304
+
305
+ if __name__ == "__main__":
306
+ app()
eval/eval_all.sh ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# eval_all.sh — Budget Router Evaluator Wrapper
# ==============================================
# Runs heuristic + LLM eval and saves results to outputs/.
#
# Usage:
#   chmod +x eval_all.sh
#   ./eval_all.sh                                  # quick: 3 seeds, heuristic + LLM
#   ./eval_all.sh --seeds 10                       # full dev set
#   ./eval_all.sh --policies heuristic             # no LLM (no API needed)
#   ./eval_all.sh --tasks hard hard_multi --seeds 5
#
# Prerequisites:
#   export HF_TOKEN=<your_huggingface_token>
#   export API_BASE_URL=https://router.huggingface.co/v1   (default)
#   export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct            (default)
#   uv or pip install -e .   (to install budget_router package)
#
# Outputs (in outputs/ directory):
#   eval_results_<timestamp>.json — full per-episode grader breakdown
#   eval_summary_<timestamp>.md   — markdown table ready for README

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# ── Defaults ────────────────────────────────────────────────────────────────
SEEDS=3
POLICIES="heuristic llm"
TASKS="easy medium hard hard_multi"
SEED_SET="dev"
OUT_DIR="$REPO_ROOT/outputs"
EXTRA_ARGS=()

# need_value FLAG ARGC — abort with a clear message when FLAG is the last
# argument (without this, `set -u` dies on "$2" with an unbound-variable error).
need_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "Error: $1 requires a value." >&2
    exit 2
  fi
}

# ── Parse CLI args ──────────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
  case "$1" in
    --seeds)    need_value "$1" "$#"; SEEDS="$2"; shift 2 ;;
    --seed-set) need_value "$1" "$#"; SEED_SET="$2"; shift 2 ;;
    --out-dir)  need_value "$1" "$#"; OUT_DIR="$2"; shift 2 ;;
    --policies)
      # Collect every following word up to the next --flag.
      POLICIES=""
      shift
      while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
        POLICIES="$POLICIES $1"; shift
      done
      ;;
    --tasks)
      TASKS=""
      shift
      while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
        TASKS="$TASKS $1"; shift
      done
      ;;
    # Anything else (e.g. --seed-values) is forwarded to eval_all.py untouched.
    *) EXTRA_ARGS+=("$1"); shift ;;
  esac
done

# ── Validate environment ─────────────────────────────────────────────────────
echo ""
echo "╔══════════════════════════════════════════════╗"
echo "║ Budget Router Evaluator ║"
echo "╚══════════════════════════════════════════════╝"
echo ""
echo "Config:"
echo " Policies: $POLICIES"
echo " Tasks: $TASKS"
echo " Seeds: $SEEDS (seed_set=$SEED_SET)"
echo " Output: $OUT_DIR/"
echo ""

# Check HF_TOKEN if LLM in policies (whole-word match, not a substring grep)
if [[ " $POLICIES " == *" llm "* ]]; then
  if [[ -z "${HF_TOKEN:-}" && -z "${API_KEY:-}" ]]; then
    echo "⚠️ WARNING: HF_TOKEN and API_KEY not set."
    echo " LLM policy will be skipped. Set HF_TOKEN to enable."
    echo ""
  else
    TOKEN_PREVIEW="${HF_TOKEN:-${API_KEY:-}}"
    # Report only presence and length: echoing a token prefix would leak
    # part of the secret into terminal scrollback and CI logs.
    echo " API key: set (${#TOKEN_PREVIEW} chars)"
    echo " Model: ${MODEL_NAME:-Qwen/Qwen2.5-72B-Instruct}"
    echo " Endpoint: ${API_BASE_URL:-https://router.huggingface.co/v1}"
    echo ""
  fi
fi

# ── Build typer args ─────────────────────────────────────────────────────────
TYPER_ARGS=(
  "--seeds" "$SEEDS"
  "--seed-set" "$SEED_SET"
  "--out-dir" "$OUT_DIR"
)

# Typer list options take one value per flag occurrence.
for p in $POLICIES; do
  TYPER_ARGS+=("--policies" "$p")
done

for t in $TASKS; do
  TYPER_ARGS+=("--tasks" "$t")
done

# ── Run ──────────────────────────────────────────────────────────────────────
cd "$SCRIPT_DIR"

# The ${arr[@]+...} form keeps an empty EXTRA_ARGS from tripping `set -u`.
if command -v uv &>/dev/null; then
  uv run python eval_all.py "${TYPER_ARGS[@]}" "${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}"
elif command -v python3 &>/dev/null; then
  python3 eval_all.py "${TYPER_ARGS[@]}" "${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}"
else
  echo "Error: neither uv nor python3 found." >&2
  exit 1
fi

echo ""
echo "✅ Evaluation complete. Results in $OUT_DIR/"
eval/outputs/prompt_audit/belief_v1_dev10/eval_results_20260425_160429.json ADDED
@@ -0,0 +1,1188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "timestamp": "20260425_160429",
4
+ "policies": [
5
+ "heuristic",
6
+ "llm"
7
+ ],
8
+ "tasks": [
9
+ "hard_multi"
10
+ ],
11
+ "seeds": [
12
+ 0,
13
+ 1,
14
+ 2,
15
+ 3,
16
+ 4,
17
+ 5,
18
+ 6,
19
+ 7,
20
+ 8,
21
+ 9
22
+ ]
23
+ },
24
+ "summary": {
25
+ "hard_multi|heuristic": {
26
+ "grader_mean": 0.6078,
27
+ "reward_mean": -2.9709,
28
+ "success_rate": 0.6998,
29
+ "adaptation": 0.6907,
30
+ "n": 10
31
+ },
32
+ "hard_multi|llm": {
33
+ "grader_mean": 0.6218,
34
+ "reward_mean": 1.3455,
35
+ "success_rate": 0.8535,
36
+ "adaptation": 0.8635,
37
+ "n": 10
38
+ }
39
+ },
40
+ "episodes": [
41
+ {
42
+ "task": "hard_multi",
43
+ "seed": 0,
44
+ "policy": "heuristic",
45
+ "total_reward": -4.4659,
46
+ "grader_score": 0.5569,
47
+ "success_score": 0.65,
48
+ "budget_score": 0.0364,
49
+ "adaptation_score": 0.6032,
50
+ "latency_score": 0.4686,
51
+ "sla_score": 0.9474,
52
+ "success_rate": 0.6842,
53
+ "steps": 20,
54
+ "actions": [
55
+ "route_to_a",
56
+ "route_to_a",
57
+ "route_to_a",
58
+ "route_to_a",
59
+ "route_to_a",
60
+ "route_to_a",
61
+ "route_to_b",
62
+ "route_to_b",
63
+ "route_to_b",
64
+ "route_to_b",
65
+ "route_to_b",
66
+ "route_to_b",
67
+ "route_to_c",
68
+ "route_to_c",
69
+ "route_to_c",
70
+ "route_to_c",
71
+ "route_to_c",
72
+ "route_to_c",
73
+ "route_to_c",
74
+ "shed_load"
75
+ ],
76
+ "rewards": [
77
+ 0.9545454545454546,
78
+ 0.9545454545454546,
79
+ 0.9545454545454546,
80
+ -2.0454545454545454,
81
+ -2.0454545454545454,
82
+ -2.0454545454545454,
83
+ 0.7727272727272727,
84
+ 0.7727272727272727,
85
+ 0.7727272727272727,
86
+ -2.2272727272727275,
87
+ -2.2272727272727275,
88
+ -2.3750364951788474,
89
+ 0.5454545454545454,
90
+ 0.5454545454545454,
91
+ 0.5454545454545454,
92
+ 0.5454545454545454,
93
+ 0.5454545454545454,
94
+ 0.5454545454545454,
95
+ 0.5454545454545454,
96
+ -0.5
97
+ ]
98
+ },
99
+ {
100
+ "task": "hard_multi",
101
+ "seed": 1,
102
+ "policy": "heuristic",
103
+ "total_reward": -2.7727,
104
+ "grader_score": 0.6077,
105
+ "success_score": 0.7,
106
+ "budget_score": 0.0455,
107
+ "adaptation_score": 0.6833,
108
+ "latency_score": 0.5213,
109
+ "sla_score": 1.0,
110
+ "success_rate": 0.7,
111
+ "steps": 20,
112
+ "actions": [
113
+ "route_to_a",
114
+ "route_to_a",
115
+ "route_to_a",
116
+ "route_to_a",
117
+ "route_to_a",
118
+ "route_to_b",
119
+ "route_to_b",
120
+ "route_to_b",
121
+ "route_to_b",
122
+ "route_to_b",
123
+ "route_to_b",
124
+ "route_to_b",
125
+ "route_to_b",
126
+ "route_to_b",
127
+ "route_to_b",
128
+ "route_to_c",
129
+ "route_to_c",
130
+ "route_to_c",
131
+ "route_to_c",
132
+ "route_to_c"
133
+ ],
134
+ "rewards": [
135
+ 0.9545454545454546,
136
+ -2.0454545454545454,
137
+ 0.9545454545454546,
138
+ -2.0454545454545454,
139
+ -2.0454545454545454,
140
+ 0.7727272727272727,
141
+ 0.7727272727272727,
142
+ 0.7727272727272727,
143
+ 0.7727272727272727,
144
+ 0.7727272727272727,
145
+ 0.7727272727272727,
146
+ 0.7727272727272727,
147
+ -2.2272727272727275,
148
+ -2.2272727272727275,
149
+ -2.2272727272727275,
150
+ 0.5454545454545454,
151
+ 0.5454545454545454,
152
+ 0.5454545454545454,
153
+ 0.5454545454545454,
154
+ 0.5454545454545454
155
+ ]
156
+ },
157
+ {
158
+ "task": "hard_multi",
159
+ "seed": 2,
160
+ "policy": "heuristic",
161
+ "total_reward": -2.0,
162
+ "grader_score": 0.6165,
163
+ "success_score": 0.7,
164
+ "budget_score": 0.2,
165
+ "adaptation_score": 0.6357,
166
+ "latency_score": 0.4967,
167
+ "sla_score": 1.0,
168
+ "success_rate": 0.7,
169
+ "steps": 20,
170
+ "actions": [
171
+ "route_to_a",
172
+ "route_to_a",
173
+ "route_to_a",
174
+ "route_to_a",
175
+ "route_to_a",
176
+ "route_to_a",
177
+ "route_to_a",
178
+ "route_to_a",
179
+ "route_to_b",
180
+ "route_to_b",
181
+ "route_to_b",
182
+ "route_to_b",
183
+ "route_to_b",
184
+ "route_to_b",
185
+ "route_to_b",
186
+ "route_to_b",
187
+ "route_to_c",
188
+ "route_to_c",
189
+ "route_to_c",
190
+ "route_to_c"
191
+ ],
192
+ "rewards": [
193
+ 0.9545454545454546,
194
+ 0.9545454545454546,
195
+ 0.9545454545454546,
196
+ 0.9545454545454546,
197
+ -2.0454545454545454,
198
+ -2.0454545454545454,
199
+ 0.9545454545454546,
200
+ -2.0454545454545454,
201
+ 0.7727272727272727,
202
+ 0.7727272727272727,
203
+ 0.7727272727272727,
204
+ 0.7727272727272727,
205
+ 0.7727272727272727,
206
+ -2.2272727272727275,
207
+ -2.2272727272727275,
208
+ -2.2272727272727275,
209
+ 0.5454545454545454,
210
+ 0.5454545454545454,
211
+ 0.5454545454545454,
212
+ 0.5454545454545454
213
+ ]
214
+ },
215
+ {
216
+ "task": "hard_multi",
217
+ "seed": 3,
218
+ "policy": "heuristic",
219
+ "total_reward": -1.9895,
220
+ "grader_score": 0.6289,
221
+ "success_score": 0.7,
222
+ "budget_score": 0.2091,
223
+ "adaptation_score": 0.6833,
224
+ "latency_score": 0.5416,
225
+ "sla_score": 0.95,
226
+ "success_rate": 0.7,
227
+ "steps": 20,
228
+ "actions": [
229
+ "route_to_a",
230
+ "route_to_a",
231
+ "route_to_a",
232
+ "route_to_a",
233
+ "route_to_a",
234
+ "route_to_a",
235
+ "route_to_a",
236
+ "route_to_b",
237
+ "route_to_b",
238
+ "route_to_b",
239
+ "route_to_b",
240
+ "route_to_b",
241
+ "route_to_b",
242
+ "route_to_b",
243
+ "route_to_b",
244
+ "route_to_b",
245
+ "route_to_b",
246
+ "route_to_c",
247
+ "route_to_c",
248
+ "route_to_c"
249
+ ],
250
+ "rewards": [
251
+ 0.9545454545454546,
252
+ 0.9545454545454546,
253
+ 0.9545454545454546,
254
+ -2.0454545454545454,
255
+ 0.9545454545454546,
256
+ -2.0454545454545454,
257
+ -2.0454545454545454,
258
+ 0.7727272727272727,
259
+ 0.7727272727272727,
260
+ 0.7727272727272727,
261
+ 0.7727272727272727,
262
+ 0.7727272727272727,
263
+ 0.7727272727272727,
264
+ 0.7727272727272727,
265
+ -2.2272727272727275,
266
+ -2.262190038025986,
267
+ -2.2272727272727275,
268
+ 0.5454545454545454,
269
+ 0.5454545454545454,
270
+ 0.5454545454545454
271
+ ]
272
+ },
273
+ {
274
+ "task": "hard_multi",
275
+ "seed": 4,
276
+ "policy": "heuristic",
277
+ "total_reward": -4.0909,
278
+ "grader_score": 0.5933,
279
+ "success_score": 0.65,
280
+ "budget_score": 0.0818,
281
+ "adaptation_score": 0.6625,
282
+ "latency_score": 0.5175,
283
+ "sla_score": 1.0,
284
+ "success_rate": 0.6842,
285
+ "steps": 20,
286
+ "actions": [
287
+ "route_to_a",
288
+ "route_to_a",
289
+ "route_to_a",
290
+ "route_to_a",
291
+ "route_to_a",
292
+ "route_to_a",
293
+ "route_to_b",
294
+ "route_to_b",
295
+ "route_to_b",
296
+ "route_to_b",
297
+ "route_to_b",
298
+ "route_to_b",
299
+ "route_to_b",
300
+ "route_to_c",
301
+ "route_to_c",
302
+ "route_to_c",
303
+ "route_to_c",
304
+ "route_to_c",
305
+ "route_to_c",
306
+ "shed_load"
307
+ ],
308
+ "rewards": [
309
+ 0.9545454545454546,
310
+ -2.0454545454545454,
311
+ 0.9545454545454546,
312
+ 0.9545454545454546,
313
+ -2.0454545454545454,
314
+ -2.0454545454545454,
315
+ 0.7727272727272727,
316
+ 0.7727272727272727,
317
+ 0.7727272727272727,
318
+ -2.2272727272727275,
319
+ 0.7727272727272727,
320
+ -2.2272727272727275,
321
+ -2.2272727272727275,
322
+ 0.5454545454545454,
323
+ 0.5454545454545454,
324
+ 0.5454545454545454,
325
+ 0.5454545454545454,
326
+ 0.5454545454545454,
327
+ 0.5454545454545454,
328
+ -0.5
329
+ ]
330
+ },
331
+ {
332
+ "task": "hard_multi",
333
+ "seed": 5,
334
+ "policy": "heuristic",
335
+ "total_reward": -1.4024,
336
+ "grader_score": 0.607,
337
+ "success_score": 0.65,
338
+ "budget_score": 0.0364,
339
+ "adaptation_score": 0.8125,
340
+ "latency_score": 0.5142,
341
+ "sla_score": 0.9412,
342
+ "success_rate": 0.7647,
343
+ "steps": 20,
344
+ "actions": [
345
+ "route_to_a",
346
+ "route_to_b",
347
+ "route_to_b",
348
+ "route_to_b",
349
+ "route_to_b",
350
+ "route_to_b",
351
+ "route_to_b",
352
+ "route_to_b",
353
+ "route_to_b",
354
+ "route_to_b",
355
+ "route_to_b",
356
+ "route_to_b",
357
+ "route_to_c",
358
+ "route_to_c",
359
+ "route_to_c",
360
+ "route_to_c",
361
+ "route_to_c",
362
+ "shed_load",
363
+ "shed_load",
364
+ "shed_load"
365
+ ],
366
+ "rewards": [
367
+ -2.0454545454545454,
368
+ 0.7727272727272727,
369
+ 0.7727272727272727,
370
+ 0.7727272727272727,
371
+ 0.7727272727272727,
372
+ 0.7727272727272727,
373
+ 0.7727272727272727,
374
+ 0.7727272727272727,
375
+ -2.2272727272727275,
376
+ 0.7727272727272727,
377
+ -2.2272727272727275,
378
+ -2.311463428136077,
379
+ 0.5454545454545454,
380
+ 0.5454545454545454,
381
+ 0.5454545454545454,
382
+ 0.5454545454545454,
383
+ 0.5454545454545454,
384
+ -0.5,
385
+ -0.5,
386
+ -0.5
387
+ ]
388
+ },
389
+ {
390
+ "task": "hard_multi",
391
+ "seed": 6,
392
+ "policy": "heuristic",
393
+ "total_reward": -3.7273,
394
+ "grader_score": 0.6546,
395
+ "success_score": 0.65,
396
+ "budget_score": 0.4545,
397
+ "adaptation_score": 0.6458,
398
+ "latency_score": 0.5611,
399
+ "sla_score": 1.0,
400
+ "success_rate": 0.65,
401
+ "steps": 20,
402
+ "actions": [
403
+ "route_to_a",
404
+ "route_to_a",
405
+ "route_to_a",
406
+ "route_to_a",
407
+ "route_to_a",
408
+ "route_to_a",
409
+ "route_to_a",
410
+ "route_to_a",
411
+ "route_to_a",
412
+ "route_to_a",
413
+ "route_to_b",
414
+ "route_to_b",
415
+ "route_to_b",
416
+ "route_to_b",
417
+ "route_to_b",
418
+ "route_to_b",
419
+ "route_to_b",
420
+ "route_to_b",
421
+ "route_to_b",
422
+ "route_to_b"
423
+ ],
424
+ "rewards": [
425
+ 0.9545454545454546,
426
+ 0.9545454545454546,
427
+ -2.0454545454545454,
428
+ 0.9545454545454546,
429
+ 0.9545454545454546,
430
+ 0.9545454545454546,
431
+ 0.9545454545454546,
432
+ -2.0454545454545454,
433
+ -2.0454545454545454,
434
+ -2.0454545454545454,
435
+ 0.7727272727272727,
436
+ 0.7727272727272727,
437
+ 0.7727272727272727,
438
+ 0.7727272727272727,
439
+ 0.7727272727272727,
440
+ 0.7727272727272727,
441
+ -2.2272727272727275,
442
+ -2.2272727272727275,
443
+ 0.7727272727272727,
444
+ -2.2272727272727275
445
+ ]
446
+ },
447
+ {
448
+ "task": "hard_multi",
449
+ "seed": 7,
450
+ "policy": "heuristic",
451
+ "total_reward": 0.1818,
452
+ "grader_score": 0.6477,
453
+ "success_score": 0.7,
454
+ "budget_score": 0.0364,
455
+ "adaptation_score": 0.85,
456
+ "latency_score": 0.5613,
457
+ "sla_score": 1.0,
458
+ "success_rate": 0.7778,
459
+ "steps": 20,
460
+ "actions": [
461
+ "route_to_a",
462
+ "route_to_b",
463
+ "route_to_b",
464
+ "route_to_b",
465
+ "route_to_b",
466
+ "route_to_b",
467
+ "route_to_b",
468
+ "route_to_b",
469
+ "route_to_b",
470
+ "route_to_b",
471
+ "route_to_b",
472
+ "route_to_b",
473
+ "route_to_b",
474
+ "route_to_b",
475
+ "route_to_c",
476
+ "route_to_c",
477
+ "route_to_c",
478
+ "route_to_c",
479
+ "shed_load",
480
+ "shed_load"
481
+ ],
482
+ "rewards": [
483
+ -2.0454545454545454,
484
+ 0.7727272727272727,
485
+ 0.7727272727272727,
486
+ 0.7727272727272727,
487
+ 0.7727272727272727,
488
+ 0.7727272727272727,
489
+ 0.7727272727272727,
490
+ 0.7727272727272727,
491
+ 0.7727272727272727,
492
+ -2.2272727272727275,
493
+ -2.2272727272727275,
494
+ 0.7727272727272727,
495
+ 0.7727272727272727,
496
+ -2.2272727272727275,
497
+ 0.5454545454545454,
498
+ 0.5454545454545454,
499
+ 0.5454545454545454,
500
+ 0.5454545454545454,
501
+ -0.5,
502
+ -0.5
503
+ ]
504
+ },
505
+ {
506
+ "task": "hard_multi",
507
+ "seed": 8,
508
+ "policy": "heuristic",
509
+ "total_reward": -8.3509,
510
+ "grader_score": 0.5338,
511
+ "success_score": 0.6,
512
+ "budget_score": 0.2,
513
+ "adaptation_score": 0.5682,
514
+ "latency_score": 0.4135,
515
+ "sla_score": 0.85,
516
+ "success_rate": 0.6,
517
+ "steps": 20,
518
+ "actions": [
519
+ "route_to_a",
520
+ "route_to_a",
521
+ "route_to_a",
522
+ "route_to_a",
523
+ "route_to_a",
524
+ "route_to_a",
525
+ "route_to_a",
526
+ "route_to_a",
527
+ "route_to_b",
528
+ "route_to_b",
529
+ "route_to_b",
530
+ "route_to_b",
531
+ "route_to_b",
532
+ "route_to_b",
533
+ "route_to_b",
534
+ "route_to_b",
535
+ "route_to_c",
536
+ "route_to_c",
537
+ "route_to_c",
538
+ "route_to_c"
539
+ ],
540
+ "rewards": [
541
+ 0.9545454545454546,
542
+ -2.0454545454545454,
543
+ 0.9545454545454546,
544
+ 0.9545454545454546,
545
+ 0.9545454545454546,
546
+ -2.0454545454545454,
547
+ -2.0454545454545454,
548
+ -2.1359667972034475,
549
+ 0.7727272727272727,
550
+ -2.2272727272727275,
551
+ 0.7727272727272727,
552
+ -2.4320645744998868,
553
+ 0.7727272727272727,
554
+ 0.7727272727272727,
555
+ -2.2272727272727275,
556
+ -2.2828540896362535,
557
+ 0.5454545454545454,
558
+ 0.5454545454545454,
559
+ 0.5454545454545454,
560
+ 0.5454545454545454
561
+ ]
562
+ },
563
+ {
564
+ "task": "hard_multi",
565
+ "seed": 9,
566
+ "policy": "heuristic",
567
+ "total_reward": -1.0909,
568
+ "grader_score": 0.6315,
569
+ "success_score": 0.7,
570
+ "budget_score": 0.0818,
571
+ "adaptation_score": 0.7625,
572
+ "latency_score": 0.5336,
573
+ "sla_score": 1.0,
574
+ "success_rate": 0.7368,
575
+ "steps": 20,
576
+ "actions": [
577
+ "route_to_a",
578
+ "route_to_b",
579
+ "route_to_b",
580
+ "route_to_b",
581
+ "route_to_b",
582
+ "route_to_b",
583
+ "route_to_b",
584
+ "route_to_b",
585
+ "route_to_b",
586
+ "route_to_b",
587
+ "route_to_b",
588
+ "route_to_b",
589
+ "route_to_b",
590
+ "route_to_b",
591
+ "route_to_b",
592
+ "route_to_b",
593
+ "route_to_b",
594
+ "route_to_c",
595
+ "route_to_c",
596
+ "shed_load"
597
+ ],
598
+ "rewards": [
599
+ -2.0454545454545454,
600
+ 0.7727272727272727,
601
+ 0.7727272727272727,
602
+ 0.7727272727272727,
603
+ 0.7727272727272727,
604
+ 0.7727272727272727,
605
+ 0.7727272727272727,
606
+ 0.7727272727272727,
607
+ 0.7727272727272727,
608
+ 0.7727272727272727,
609
+ -2.2272727272727275,
610
+ 0.7727272727272727,
611
+ 0.7727272727272727,
612
+ -2.2272727272727275,
613
+ 0.7727272727272727,
614
+ -2.2272727272727275,
615
+ -2.2272727272727275,
616
+ 0.5454545454545454,
617
+ 0.5454545454545454,
618
+ -0.5
619
+ ]
620
+ },
621
+ {
622
+ "task": "hard_multi",
623
+ "seed": 0,
624
+ "policy": "llm",
625
+ "total_reward": -3.7273,
626
+ "grader_score": 0.5176,
627
+ "success_score": 0.8333,
628
+ "budget_score": 0.0,
629
+ "adaptation_score": 0.7946,
630
+ "latency_score": 0.656,
631
+ "sla_score": 1.0,
632
+ "success_rate": 0.8333,
633
+ "steps": 18,
634
+ "actions": [
635
+ "route_to_a",
636
+ "route_to_a",
637
+ "route_to_a",
638
+ "route_to_a",
639
+ "route_to_b",
640
+ "route_to_b",
641
+ "route_to_b",
642
+ "route_to_b",
643
+ "route_to_b",
644
+ "route_to_b",
645
+ "route_to_c",
646
+ "route_to_c",
647
+ "route_to_c",
648
+ "route_to_c",
649
+ "route_to_c",
650
+ "route_to_c",
651
+ "route_to_c",
652
+ "route_to_c"
653
+ ],
654
+ "rewards": [
655
+ 0.9545454545454546,
656
+ 0.9545454545454546,
657
+ 0.9545454545454546,
658
+ -2.0454545454545454,
659
+ 0.7727272727272727,
660
+ 0.7727272727272727,
661
+ 0.7727272727272727,
662
+ 0.7727272727272727,
663
+ 0.7727272727272727,
664
+ -2.2272727272727275,
665
+ 0.5454545454545454,
666
+ 0.5454545454545454,
667
+ 0.5454545454545454,
668
+ 0.5454545454545454,
669
+ 0.5454545454545454,
670
+ 0.5454545454545454,
671
+ 0.5454545454545454,
672
+ -10.0
673
+ ]
674
+ },
675
+ {
676
+ "task": "hard_multi",
677
+ "seed": 1,
678
+ "policy": "llm",
679
+ "total_reward": 6.1364,
680
+ "grader_score": 0.6994,
681
+ "success_score": 0.8,
682
+ "budget_score": 0.0273,
683
+ "adaptation_score": 0.8786,
684
+ "latency_score": 0.648,
685
+ "sla_score": 1.0,
686
+ "success_rate": 0.8889,
687
+ "steps": 20,
688
+ "actions": [
689
+ "route_to_a",
690
+ "route_to_a",
691
+ "route_to_b",
692
+ "route_to_b",
693
+ "route_to_b",
694
+ "route_to_b",
695
+ "route_to_b",
696
+ "route_to_b",
697
+ "route_to_b",
698
+ "route_to_b",
699
+ "route_to_b",
700
+ "route_to_b",
701
+ "route_to_b",
702
+ "route_to_c",
703
+ "route_to_c",
704
+ "route_to_c",
705
+ "route_to_c",
706
+ "route_to_c",
707
+ "shed_load",
708
+ "shed_load"
709
+ ],
710
+ "rewards": [
711
+ 0.9545454545454546,
712
+ -2.0454545454545454,
713
+ 0.7727272727272727,
714
+ 0.7727272727272727,
715
+ 0.7727272727272727,
716
+ 0.7727272727272727,
717
+ 0.7727272727272727,
718
+ 0.7727272727272727,
719
+ 0.7727272727272727,
720
+ 0.7727272727272727,
721
+ 0.7727272727272727,
722
+ 0.7727272727272727,
723
+ -2.2272727272727275,
724
+ 0.5454545454545454,
725
+ 0.5454545454545454,
726
+ 0.5454545454545454,
727
+ 0.5454545454545454,
728
+ 0.5454545454545454,
729
+ -0.5,
730
+ -0.5
731
+ ]
732
+ },
733
+ {
734
+ "task": "hard_multi",
735
+ "seed": 2,
736
+ "policy": "llm",
737
+ "total_reward": -1.5455,
738
+ "grader_score": 0.5204,
739
+ "success_score": 0.85,
740
+ "budget_score": 0.0,
741
+ "adaptation_score": 0.8071,
742
+ "latency_score": 0.6372,
743
+ "sla_score": 1.0,
744
+ "success_rate": 0.85,
745
+ "steps": 20,
746
+ "actions": [
747
+ "route_to_a",
748
+ "route_to_a",
749
+ "route_to_a",
750
+ "route_to_a",
751
+ "route_to_a",
752
+ "route_to_b",
753
+ "route_to_b",
754
+ "route_to_b",
755
+ "route_to_b",
756
+ "route_to_b",
757
+ "route_to_b",
758
+ "route_to_b",
759
+ "route_to_b",
760
+ "route_to_b",
761
+ "route_to_c",
762
+ "route_to_c",
763
+ "route_to_c",
764
+ "route_to_c",
765
+ "route_to_c",
766
+ "route_to_c"
767
+ ],
768
+ "rewards": [
769
+ 0.9545454545454546,
770
+ 0.9545454545454546,
771
+ 0.9545454545454546,
772
+ 0.9545454545454546,
773
+ -2.0454545454545454,
774
+ 0.7727272727272727,
775
+ 0.7727272727272727,
776
+ 0.7727272727272727,
777
+ 0.7727272727272727,
778
+ 0.7727272727272727,
779
+ 0.7727272727272727,
780
+ 0.7727272727272727,
781
+ 0.7727272727272727,
782
+ -2.2272727272727275,
783
+ 0.5454545454545454,
784
+ 0.5454545454545454,
785
+ 0.5454545454545454,
786
+ 0.5454545454545454,
787
+ 0.5454545454545454,
788
+ -10.0
789
+ ]
790
+ },
791
+ {
792
+ "task": "hard_multi",
793
+ "seed": 3,
794
+ "policy": "llm",
795
+ "total_reward": 9.0455,
796
+ "grader_score": 0.7388,
797
+ "success_score": 0.9,
798
+ "budget_score": 0.0091,
799
+ "adaptation_score": 0.8944,
800
+ "latency_score": 0.6926,
801
+ "sla_score": 1.0,
802
+ "success_rate": 0.9,
803
+ "steps": 20,
804
+ "actions": [
805
+ "route_to_a",
806
+ "route_to_a",
807
+ "route_to_a",
808
+ "route_to_a",
809
+ "route_to_b",
810
+ "route_to_b",
811
+ "route_to_b",
812
+ "route_to_b",
813
+ "route_to_b",
814
+ "route_to_b",
815
+ "route_to_b",
816
+ "route_to_b",
817
+ "route_to_b",
818
+ "route_to_b",
819
+ "route_to_b",
820
+ "route_to_c",
821
+ "route_to_c",
822
+ "route_to_c",
823
+ "route_to_c",
824
+ "route_to_c"
825
+ ],
826
+ "rewards": [
827
+ 0.9545454545454546,
828
+ 0.9545454545454546,
829
+ 0.9545454545454546,
830
+ -2.0454545454545454,
831
+ 0.7727272727272727,
832
+ 0.7727272727272727,
833
+ 0.7727272727272727,
834
+ 0.7727272727272727,
835
+ 0.7727272727272727,
836
+ 0.7727272727272727,
837
+ 0.7727272727272727,
838
+ 0.7727272727272727,
839
+ 0.7727272727272727,
840
+ 0.7727272727272727,
841
+ -2.2272727272727275,
842
+ 0.5454545454545454,
843
+ 0.5454545454545454,
844
+ 0.5454545454545454,
845
+ 0.5454545454545454,
846
+ 0.5454545454545454
847
+ ]
848
+ },
849
+ {
850
+ "task": "hard_multi",
851
+ "seed": 4,
852
+ "policy": "llm",
853
+ "total_reward": -9.6364,
854
+ "grader_score": 0.4732,
855
+ "success_score": 0.7222,
856
+ "budget_score": 0.0,
857
+ "adaptation_score": 0.7083,
858
+ "latency_score": 0.6132,
859
+ "sla_score": 1.0,
860
+ "success_rate": 0.7222,
861
+ "steps": 18,
862
+ "actions": [
863
+ "route_to_a",
864
+ "route_to_a",
865
+ "route_to_b",
866
+ "route_to_b",
867
+ "route_to_b",
868
+ "route_to_b",
869
+ "route_to_b",
870
+ "route_to_b",
871
+ "route_to_b",
872
+ "route_to_b",
873
+ "route_to_b",
874
+ "route_to_b",
875
+ "route_to_c",
876
+ "route_to_c",
877
+ "route_to_c",
878
+ "route_to_c",
879
+ "route_to_c",
880
+ "route_to_c"
881
+ ],
882
+ "rewards": [
883
+ 0.9545454545454546,
884
+ -2.0454545454545454,
885
+ 0.7727272727272727,
886
+ 0.7727272727272727,
887
+ -2.2272727272727275,
888
+ 0.7727272727272727,
889
+ 0.7727272727272727,
890
+ 0.7727272727272727,
891
+ 0.7727272727272727,
892
+ -2.2272727272727275,
893
+ 0.7727272727272727,
894
+ -2.2272727272727275,
895
+ 0.5454545454545454,
896
+ 0.5454545454545454,
897
+ 0.5454545454545454,
898
+ 0.5454545454545454,
899
+ 0.5454545454545454,
900
+ -10.0
901
+ ]
902
+ },
903
+ {
904
+ "task": "hard_multi",
905
+ "seed": 5,
906
+ "policy": "llm",
907
+ "total_reward": 1.9091,
908
+ "grader_score": 0.6665,
909
+ "success_score": 0.65,
910
+ "budget_score": 0.0818,
911
+ "adaptation_score": 0.9375,
912
+ "latency_score": 0.6085,
913
+ "sla_score": 1.0,
914
+ "success_rate": 0.8667,
915
+ "steps": 20,
916
+ "actions": [
917
+ "route_to_a",
918
+ "route_to_b",
919
+ "route_to_b",
920
+ "route_to_b",
921
+ "route_to_b",
922
+ "route_to_b",
923
+ "route_to_b",
924
+ "route_to_b",
925
+ "route_to_b",
926
+ "route_to_c",
927
+ "route_to_c",
928
+ "route_to_c",
929
+ "route_to_c",
930
+ "route_to_c",
931
+ "route_to_c",
932
+ "shed_load",
933
+ "shed_load",
934
+ "shed_load",
935
+ "shed_load",
936
+ "shed_load"
937
+ ],
938
+ "rewards": [
939
+ -2.0454545454545454,
940
+ 0.7727272727272727,
941
+ 0.7727272727272727,
942
+ 0.7727272727272727,
943
+ 0.7727272727272727,
944
+ 0.7727272727272727,
945
+ 0.7727272727272727,
946
+ 0.7727272727272727,
947
+ -2.2272727272727275,
948
+ 0.5454545454545454,
949
+ 0.5454545454545454,
950
+ 0.5454545454545454,
951
+ 0.5454545454545454,
952
+ 0.5454545454545454,
953
+ 0.5454545454545454,
954
+ -0.5,
955
+ -0.5,
956
+ -0.5,
957
+ -0.5,
958
+ -0.5
959
+ ]
960
+ },
961
+ {
962
+ "task": "hard_multi",
963
+ "seed": 6,
964
+ "policy": "llm",
965
+ "total_reward": 9.3182,
966
+ "grader_score": 0.7535,
967
+ "success_score": 0.9,
968
+ "budget_score": 0.0636,
969
+ "adaptation_score": 0.9444,
970
+ "latency_score": 0.6755,
971
+ "sla_score": 1.0,
972
+ "success_rate": 0.9,
973
+ "steps": 20,
974
+ "actions": [
975
+ "route_to_a",
976
+ "route_to_a",
977
+ "route_to_a",
978
+ "route_to_b",
979
+ "route_to_b",
980
+ "route_to_b",
981
+ "route_to_b",
982
+ "route_to_b",
983
+ "route_to_b",
984
+ "route_to_b",
985
+ "route_to_b",
986
+ "route_to_b",
987
+ "route_to_b",
988
+ "route_to_b",
989
+ "route_to_b",
990
+ "route_to_b",
991
+ "route_to_b",
992
+ "route_to_c",
993
+ "route_to_c",
994
+ "route_to_c"
995
+ ],
996
+ "rewards": [
997
+ 0.9545454545454546,
998
+ 0.9545454545454546,
999
+ -2.0454545454545454,
1000
+ 0.7727272727272727,
1001
+ 0.7727272727272727,
1002
+ 0.7727272727272727,
1003
+ 0.7727272727272727,
1004
+ 0.7727272727272727,
1005
+ 0.7727272727272727,
1006
+ 0.7727272727272727,
1007
+ 0.7727272727272727,
1008
+ 0.7727272727272727,
1009
+ 0.7727272727272727,
1010
+ 0.7727272727272727,
1011
+ 0.7727272727272727,
1012
+ 0.7727272727272727,
1013
+ -2.2272727272727275,
1014
+ 0.5454545454545454,
1015
+ 0.5454545454545454,
1016
+ 0.5454545454545454
1017
+ ]
1018
+ },
1019
+ {
1020
+ "task": "hard_multi",
1021
+ "seed": 7,
1022
+ "policy": "llm",
1023
+ "total_reward": 3.1818,
1024
+ "grader_score": 0.673,
1025
+ "success_score": 0.7,
1026
+ "budget_score": 0.0364,
1027
+ "adaptation_score": 0.9375,
1028
+ "latency_score": 0.6004,
1029
+ "sla_score": 1.0,
1030
+ "success_rate": 0.875,
1031
+ "steps": 20,
1032
+ "actions": [
1033
+ "route_to_a",
1034
+ "route_to_b",
1035
+ "route_to_b",
1036
+ "route_to_b",
1037
+ "route_to_b",
1038
+ "route_to_b",
1039
+ "route_to_b",
1040
+ "route_to_b",
1041
+ "route_to_b",
1042
+ "route_to_b",
1043
+ "route_to_c",
1044
+ "route_to_c",
1045
+ "route_to_c",
1046
+ "route_to_c",
1047
+ "route_to_c",
1048
+ "route_to_c",
1049
+ "shed_load",
1050
+ "shed_load",
1051
+ "shed_load",
1052
+ "shed_load"
1053
+ ],
1054
+ "rewards": [
1055
+ -2.0454545454545454,
1056
+ 0.7727272727272727,
1057
+ 0.7727272727272727,
1058
+ 0.7727272727272727,
1059
+ 0.7727272727272727,
1060
+ 0.7727272727272727,
1061
+ 0.7727272727272727,
1062
+ 0.7727272727272727,
1063
+ 0.7727272727272727,
1064
+ -2.2272727272727275,
1065
+ 0.5454545454545454,
1066
+ 0.5454545454545454,
1067
+ 0.5454545454545454,
1068
+ 0.5454545454545454,
1069
+ 0.5454545454545454,
1070
+ 0.5454545454545454,
1071
+ -0.5,
1072
+ -0.5,
1073
+ -0.5,
1074
+ -0.5
1075
+ ]
1076
+ },
1077
+ {
1078
+ "task": "hard_multi",
1079
+ "seed": 8,
1080
+ "policy": "llm",
1081
+ "total_reward": 3.3636,
1082
+ "grader_score": 0.6573,
1083
+ "success_score": 0.7,
1084
+ "budget_score": 0.0727,
1085
+ "adaptation_score": 0.8661,
1086
+ "latency_score": 0.5661,
1087
+ "sla_score": 1.0,
1088
+ "success_rate": 0.875,
1089
+ "steps": 20,
1090
+ "actions": [
1091
+ "route_to_a",
1092
+ "route_to_a",
1093
+ "route_to_b",
1094
+ "route_to_b",
1095
+ "route_to_b",
1096
+ "route_to_b",
1097
+ "route_to_b",
1098
+ "route_to_b",
1099
+ "route_to_b",
1100
+ "route_to_b",
1101
+ "route_to_c",
1102
+ "route_to_c",
1103
+ "route_to_c",
1104
+ "route_to_c",
1105
+ "route_to_c",
1106
+ "route_to_c",
1107
+ "shed_load",
1108
+ "shed_load",
1109
+ "shed_load",
1110
+ "shed_load"
1111
+ ],
1112
+ "rewards": [
1113
+ 0.9545454545454546,
1114
+ -2.0454545454545454,
1115
+ 0.7727272727272727,
1116
+ 0.7727272727272727,
1117
+ 0.7727272727272727,
1118
+ 0.7727272727272727,
1119
+ 0.7727272727272727,
1120
+ 0.7727272727272727,
1121
+ 0.7727272727272727,
1122
+ -2.2272727272727275,
1123
+ 0.5454545454545454,
1124
+ 0.5454545454545454,
1125
+ 0.5454545454545454,
1126
+ 0.5454545454545454,
1127
+ 0.5454545454545454,
1128
+ 0.5454545454545454,
1129
+ -0.5,
1130
+ -0.5,
1131
+ -0.5,
1132
+ -0.5
1133
+ ]
1134
+ },
1135
+ {
1136
+ "task": "hard_multi",
1137
+ "seed": 9,
1138
+ "policy": "llm",
1139
+ "total_reward": -4.5909,
1140
+ "grader_score": 0.5185,
1141
+ "success_score": 0.8235,
1142
+ "budget_score": 0.0,
1143
+ "adaptation_score": 0.8667,
1144
+ "latency_score": 0.605,
1145
+ "sla_score": 1.0,
1146
+ "success_rate": 0.8235,
1147
+ "steps": 17,
1148
+ "actions": [
1149
+ "route_to_a",
1150
+ "route_to_b",
1151
+ "route_to_b",
1152
+ "route_to_b",
1153
+ "route_to_b",
1154
+ "route_to_b",
1155
+ "route_to_b",
1156
+ "route_to_b",
1157
+ "route_to_b",
1158
+ "route_to_b",
1159
+ "route_to_b",
1160
+ "route_to_c",
1161
+ "route_to_c",
1162
+ "route_to_c",
1163
+ "route_to_c",
1164
+ "route_to_c",
1165
+ "route_to_c"
1166
+ ],
1167
+ "rewards": [
1168
+ -2.0454545454545454,
1169
+ 0.7727272727272727,
1170
+ 0.7727272727272727,
1171
+ 0.7727272727272727,
1172
+ 0.7727272727272727,
1173
+ 0.7727272727272727,
1174
+ 0.7727272727272727,
1175
+ 0.7727272727272727,
1176
+ 0.7727272727272727,
1177
+ 0.7727272727272727,
1178
+ -2.2272727272727275,
1179
+ 0.5454545454545454,
1180
+ 0.5454545454545454,
1181
+ 0.5454545454545454,
1182
+ 0.5454545454545454,
1183
+ 0.5454545454545454,
1184
+ -10.0
1185
+ ]
1186
+ }
1187
+ ]
1188
+ }
eval/outputs/prompt_audit/belief_v1_dev10/eval_summary_20260425_160429.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Budget Router Evaluation — 20260425_160429
2
+
3
+ | Task | HEURISTIC Grader | LLM Grader | Notes |
4
+ |---|---|---|---|
5
+ | Hard_Multi | 0.6078 (n=10) | 0.6218 (n=10) | LLM +1.4 points vs heuristic |
eval/outputs/prompt_audit/belief_v1_heldout5/eval_results_20260425_160016.json ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "timestamp": "20260425_160016",
4
+ "policies": [
5
+ "heuristic",
6
+ "llm"
7
+ ],
8
+ "tasks": [
9
+ "hard_multi"
10
+ ],
11
+ "seeds": [
12
+ 100,
13
+ 101,
14
+ 102,
15
+ 103,
16
+ 104
17
+ ]
18
+ },
19
+ "summary": {
20
+ "hard_multi|heuristic": {
21
+ "grader_mean": 0.6175,
22
+ "reward_mean": -2.1399,
23
+ "success_rate": 0.7108,
24
+ "adaptation": 0.7001,
25
+ "n": 5
26
+ },
27
+ "hard_multi|llm": {
28
+ "grader_mean": 0.6297,
29
+ "reward_mean": 1.4818,
30
+ "success_rate": 0.8462,
31
+ "adaptation": 0.8568,
32
+ "n": 5
33
+ }
34
+ },
35
+ "episodes": [
36
+ {
37
+ "task": "hard_multi",
38
+ "seed": 100,
39
+ "policy": "heuristic",
40
+ "total_reward": -7.0629,
41
+ "grader_score": 0.5459,
42
+ "success_score": 0.6,
43
+ "budget_score": 0.0909,
44
+ "adaptation_score": 0.6111,
45
+ "latency_score": 0.4399,
46
+ "sla_score": 0.9474,
47
+ "success_rate": 0.6316,
48
+ "steps": 20,
49
+ "actions": [
50
+ "route_to_a",
51
+ "route_to_a",
52
+ "route_to_a",
53
+ "route_to_a",
54
+ "route_to_a",
55
+ "route_to_b",
56
+ "route_to_b",
57
+ "route_to_b",
58
+ "route_to_b",
59
+ "route_to_b",
60
+ "route_to_b",
61
+ "route_to_b",
62
+ "route_to_b",
63
+ "route_to_b",
64
+ "route_to_c",
65
+ "route_to_c",
66
+ "route_to_c",
67
+ "route_to_c",
68
+ "route_to_c",
69
+ "shed_load"
70
+ ],
71
+ "rewards": [
72
+ 0.9545454545454546,
73
+ -2.0454545454545454,
74
+ 0.9545454545454546,
75
+ -2.0454545454545454,
76
+ -2.0454545454545454,
77
+ 0.7727272727272727,
78
+ -2.2272727272727275,
79
+ 0.7727272727272727,
80
+ 0.7727272727272727,
81
+ 0.7727272727272727,
82
+ -2.2272727272727275,
83
+ 0.7727272727272727,
84
+ -2.2272727272727275,
85
+ -2.2447259319640143,
86
+ 0.5454545454545454,
87
+ 0.5454545454545454,
88
+ 0.5454545454545454,
89
+ 0.5454545454545454,
90
+ 0.5454545454545454,
91
+ -0.5
92
+ ]
93
+ },
94
+ {
95
+ "task": "hard_multi",
96
+ "seed": 101,
97
+ "policy": "heuristic",
98
+ "total_reward": 3.4091,
99
+ "grader_score": 0.6753,
100
+ "success_score": 0.8,
101
+ "budget_score": 0.0818,
102
+ "adaptation_score": 0.7857,
103
+ "latency_score": 0.5795,
104
+ "sla_score": 1.0,
105
+ "success_rate": 0.8,
106
+ "steps": 20,
107
+ "actions": [
108
+ "route_to_a",
109
+ "route_to_b",
110
+ "route_to_b",
111
+ "route_to_b",
112
+ "route_to_b",
113
+ "route_to_b",
114
+ "route_to_b",
115
+ "route_to_b",
116
+ "route_to_b",
117
+ "route_to_b",
118
+ "route_to_b",
119
+ "route_to_b",
120
+ "route_to_b",
121
+ "route_to_b",
122
+ "route_to_b",
123
+ "route_to_b",
124
+ "route_to_b",
125
+ "route_to_b",
126
+ "route_to_b",
127
+ "route_to_c"
128
+ ],
129
+ "rewards": [
130
+ -2.0454545454545454,
131
+ 0.7727272727272727,
132
+ 0.7727272727272727,
133
+ 0.7727272727272727,
134
+ 0.7727272727272727,
135
+ 0.7727272727272727,
136
+ 0.7727272727272727,
137
+ 0.7727272727272727,
138
+ 0.7727272727272727,
139
+ 0.7727272727272727,
140
+ 0.7727272727272727,
141
+ 0.7727272727272727,
142
+ 0.7727272727272727,
143
+ 0.7727272727272727,
144
+ -2.2272727272727275,
145
+ 0.7727272727272727,
146
+ -2.2272727272727275,
147
+ 0.7727272727272727,
148
+ -2.2272727272727275,
149
+ 0.5454545454545454
150
+ ]
151
+ },
152
+ {
153
+ "task": "hard_multi",
154
+ "seed": 102,
155
+ "policy": "heuristic",
156
+ "total_reward": -2.5909,
157
+ "grader_score": 0.6228,
158
+ "success_score": 0.7,
159
+ "budget_score": 0.0818,
160
+ "adaptation_score": 0.6932,
161
+ "latency_score": 0.5593,
162
+ "sla_score": 1.0,
163
+ "success_rate": 0.7,
164
+ "steps": 20,
165
+ "actions": [
166
+ "route_to_a",
167
+ "route_to_a",
168
+ "route_to_a",
169
+ "route_to_a",
170
+ "route_to_a",
171
+ "route_to_a",
172
+ "route_to_b",
173
+ "route_to_b",
174
+ "route_to_b",
175
+ "route_to_b",
176
+ "route_to_b",
177
+ "route_to_b",
178
+ "route_to_b",
179
+ "route_to_b",
180
+ "route_to_b",
181
+ "route_to_c",
182
+ "route_to_c",
183
+ "route_to_c",
184
+ "route_to_c",
185
+ "route_to_c"
186
+ ],
187
+ "rewards": [
188
+ 0.9545454545454546,
189
+ 0.9545454545454546,
190
+ -2.0454545454545454,
191
+ -2.0454545454545454,
192
+ 0.9545454545454546,
193
+ -2.0454545454545454,
194
+ 0.7727272727272727,
195
+ 0.7727272727272727,
196
+ 0.7727272727272727,
197
+ 0.7727272727272727,
198
+ 0.7727272727272727,
199
+ -2.2272727272727275,
200
+ 0.7727272727272727,
201
+ -2.2272727272727275,
202
+ -2.2272727272727275,
203
+ 0.5454545454545454,
204
+ 0.5454545454545454,
205
+ 0.5454545454545454,
206
+ 0.5454545454545454,
207
+ 0.5454545454545454
208
+ ]
209
+ },
210
+ {
211
+ "task": "hard_multi",
212
+ "seed": 103,
213
+ "policy": "heuristic",
214
+ "total_reward": -2.8182,
215
+ "grader_score": 0.6003,
216
+ "success_score": 0.65,
217
+ "budget_score": 0.0364,
218
+ "adaptation_score": 0.75,
219
+ "latency_score": 0.4991,
220
+ "sla_score": 1.0,
221
+ "success_rate": 0.7222,
222
+ "steps": 20,
223
+ "actions": [
224
+ "route_to_a",
225
+ "route_to_b",
226
+ "route_to_b",
227
+ "route_to_b",
228
+ "route_to_b",
229
+ "route_to_b",
230
+ "route_to_b",
231
+ "route_to_b",
232
+ "route_to_b",
233
+ "route_to_b",
234
+ "route_to_b",
235
+ "route_to_b",
236
+ "route_to_b",
237
+ "route_to_b",
238
+ "route_to_c",
239
+ "route_to_c",
240
+ "route_to_c",
241
+ "route_to_c",
242
+ "shed_load",
243
+ "shed_load"
244
+ ],
245
+ "rewards": [
246
+ -2.0454545454545454,
247
+ 0.7727272727272727,
248
+ 0.7727272727272727,
249
+ 0.7727272727272727,
250
+ 0.7727272727272727,
251
+ -2.2272727272727275,
252
+ 0.7727272727272727,
253
+ 0.7727272727272727,
254
+ 0.7727272727272727,
255
+ 0.7727272727272727,
256
+ -2.2272727272727275,
257
+ -2.2272727272727275,
258
+ 0.7727272727272727,
259
+ -2.2272727272727275,
260
+ 0.5454545454545454,
261
+ 0.5454545454545454,
262
+ 0.5454545454545454,
263
+ 0.5454545454545454,
264
+ -0.5,
265
+ -0.5
266
+ ]
267
+ },
268
+ {
269
+ "task": "hard_multi",
270
+ "seed": 104,
271
+ "policy": "heuristic",
272
+ "total_reward": -1.6364,
273
+ "grader_score": 0.6432,
274
+ "success_score": 0.7,
275
+ "budget_score": 0.2727,
276
+ "adaptation_score": 0.6607,
277
+ "latency_score": 0.5509,
278
+ "sla_score": 1.0,
279
+ "success_rate": 0.7,
280
+ "steps": 20,
281
+ "actions": [
282
+ "route_to_a",
283
+ "route_to_a",
284
+ "route_to_a",
285
+ "route_to_a",
286
+ "route_to_a",
287
+ "route_to_b",
288
+ "route_to_b",
289
+ "route_to_b",
290
+ "route_to_b",
291
+ "route_to_b",
292
+ "route_to_b",
293
+ "route_to_b",
294
+ "route_to_b",
295
+ "route_to_b",
296
+ "route_to_b",
297
+ "route_to_b",
298
+ "route_to_b",
299
+ "route_to_b",
300
+ "route_to_b",
301
+ "route_to_b"
302
+ ],
303
+ "rewards": [
304
+ 0.9545454545454546,
305
+ -2.0454545454545454,
306
+ 0.9545454545454546,
307
+ -2.0454545454545454,
308
+ -2.0454545454545454,
309
+ 0.7727272727272727,
310
+ 0.7727272727272727,
311
+ 0.7727272727272727,
312
+ 0.7727272727272727,
313
+ 0.7727272727272727,
314
+ 0.7727272727272727,
315
+ 0.7727272727272727,
316
+ 0.7727272727272727,
317
+ -2.2272727272727275,
318
+ 0.7727272727272727,
319
+ 0.7727272727272727,
320
+ 0.7727272727272727,
321
+ -2.2272727272727275,
322
+ 0.7727272727272727,
323
+ -2.2272727272727275
324
+ ]
325
+ },
326
+ {
327
+ "task": "hard_multi",
328
+ "seed": 100,
329
+ "policy": "llm",
330
+ "total_reward": -1.1364,
331
+ "grader_score": 0.6114,
332
+ "success_score": 0.55,
333
+ "budget_score": 0.0727,
334
+ "adaptation_score": 0.8889,
335
+ "latency_score": 0.5387,
336
+ "sla_score": 1.0,
337
+ "success_rate": 0.8462,
338
+ "steps": 20,
339
+ "actions": [
340
+ "route_to_a",
341
+ "route_to_a",
342
+ "route_to_b",
343
+ "route_to_b",
344
+ "route_to_c",
345
+ "route_to_c",
346
+ "route_to_c",
347
+ "route_to_c",
348
+ "route_to_c",
349
+ "route_to_c",
350
+ "route_to_c",
351
+ "route_to_c",
352
+ "route_to_c",
353
+ "shed_load",
354
+ "shed_load",
355
+ "shed_load",
356
+ "shed_load",
357
+ "shed_load",
358
+ "shed_load",
359
+ "shed_load"
360
+ ],
361
+ "rewards": [
362
+ 0.9545454545454546,
363
+ -2.0454545454545454,
364
+ 0.7727272727272727,
365
+ -2.2272727272727275,
366
+ 0.5454545454545454,
367
+ 0.5454545454545454,
368
+ 0.5454545454545454,
369
+ 0.5454545454545454,
370
+ 0.5454545454545454,
371
+ 0.5454545454545454,
372
+ 0.5454545454545454,
373
+ 0.5454545454545454,
374
+ 0.5454545454545454,
375
+ -0.5,
376
+ -0.5,
377
+ -0.5,
378
+ -0.5,
379
+ -0.5,
380
+ -0.5,
381
+ -0.5
382
+ ]
383
+ },
384
+ {
385
+ "task": "hard_multi",
386
+ "seed": 101,
387
+ "policy": "llm",
388
+ "total_reward": -2.5909,
389
+ "grader_score": 0.5212,
390
+ "success_score": 0.8421,
391
+ "budget_score": 0.0,
392
+ "adaptation_score": 0.8333,
393
+ "latency_score": 0.6282,
394
+ "sla_score": 1.0,
395
+ "success_rate": 0.8421,
396
+ "steps": 19,
397
+ "actions": [
398
+ "route_to_a",
399
+ "route_to_b",
400
+ "route_to_b",
401
+ "route_to_b",
402
+ "route_to_b",
403
+ "route_to_b",
404
+ "route_to_b",
405
+ "route_to_b",
406
+ "route_to_b",
407
+ "route_to_b",
408
+ "route_to_b",
409
+ "route_to_b",
410
+ "route_to_b",
411
+ "route_to_b",
412
+ "route_to_b",
413
+ "route_to_c",
414
+ "route_to_c",
415
+ "route_to_c",
416
+ "route_to_c"
417
+ ],
418
+ "rewards": [
419
+ -2.0454545454545454,
420
+ 0.7727272727272727,
421
+ 0.7727272727272727,
422
+ 0.7727272727272727,
423
+ 0.7727272727272727,
424
+ 0.7727272727272727,
425
+ 0.7727272727272727,
426
+ 0.7727272727272727,
427
+ 0.7727272727272727,
428
+ 0.7727272727272727,
429
+ 0.7727272727272727,
430
+ 0.7727272727272727,
431
+ 0.7727272727272727,
432
+ 0.7727272727272727,
433
+ -2.2272727272727275,
434
+ 0.5454545454545454,
435
+ 0.5454545454545454,
436
+ 0.5454545454545454,
437
+ -10.0
438
+ ]
439
+ },
440
+ {
441
+ "task": "hard_multi",
442
+ "seed": 102,
443
+ "policy": "llm",
444
+ "total_reward": 6.0909,
445
+ "grader_score": 0.707,
446
+ "success_score": 0.8,
447
+ "budget_score": 0.0182,
448
+ "adaptation_score": 0.9091,
449
+ "latency_score": 0.6621,
450
+ "sla_score": 1.0,
451
+ "success_rate": 0.8889,
452
+ "steps": 20,
453
+ "actions": [
454
+ "route_to_a",
455
+ "route_to_a",
456
+ "route_to_a",
457
+ "route_to_b",
458
+ "route_to_b",
459
+ "route_to_b",
460
+ "route_to_b",
461
+ "route_to_b",
462
+ "route_to_b",
463
+ "route_to_b",
464
+ "route_to_b",
465
+ "route_to_b",
466
+ "route_to_c",
467
+ "route_to_c",
468
+ "route_to_c",
469
+ "route_to_c",
470
+ "route_to_c",
471
+ "route_to_c",
472
+ "shed_load",
473
+ "shed_load"
474
+ ],
475
+ "rewards": [
476
+ 0.9545454545454546,
477
+ 0.9545454545454546,
478
+ -2.0454545454545454,
479
+ 0.7727272727272727,
480
+ 0.7727272727272727,
481
+ 0.7727272727272727,
482
+ 0.7727272727272727,
483
+ 0.7727272727272727,
484
+ 0.7727272727272727,
485
+ 0.7727272727272727,
486
+ 0.7727272727272727,
487
+ -2.2272727272727275,
488
+ 0.5454545454545454,
489
+ 0.5454545454545454,
490
+ 0.5454545454545454,
491
+ 0.5454545454545454,
492
+ 0.5454545454545454,
493
+ 0.5454545454545454,
494
+ -0.5,
495
+ -0.5
496
+ ]
497
+ },
498
+ {
499
+ "task": "hard_multi",
500
+ "seed": 103,
501
+ "policy": "llm",
502
+ "total_reward": -1.3182,
503
+ "grader_score": 0.6135,
504
+ "success_score": 0.65,
505
+ "budget_score": 0.0364,
506
+ "adaptation_score": 0.7946,
507
+ "latency_score": 0.5208,
508
+ "sla_score": 1.0,
509
+ "success_rate": 0.7647,
510
+ "steps": 20,
511
+ "actions": [
512
+ "route_to_a",
513
+ "route_to_b",
514
+ "route_to_b",
515
+ "route_to_b",
516
+ "route_to_b",
517
+ "route_to_b",
518
+ "route_to_b",
519
+ "route_to_b",
520
+ "route_to_b",
521
+ "route_to_b",
522
+ "route_to_b",
523
+ "route_to_b",
524
+ "route_to_c",
525
+ "route_to_c",
526
+ "route_to_c",
527
+ "route_to_c",
528
+ "route_to_c",
529
+ "shed_load",
530
+ "shed_load",
531
+ "shed_load"
532
+ ],
533
+ "rewards": [
534
+ -2.0454545454545454,
535
+ 0.7727272727272727,
536
+ 0.7727272727272727,
537
+ 0.7727272727272727,
538
+ 0.7727272727272727,
539
+ -2.2272727272727275,
540
+ 0.7727272727272727,
541
+ 0.7727272727272727,
542
+ 0.7727272727272727,
543
+ 0.7727272727272727,
544
+ -2.2272727272727275,
545
+ -2.2272727272727275,
546
+ 0.5454545454545454,
547
+ 0.5454545454545454,
548
+ 0.5454545454545454,
549
+ 0.5454545454545454,
550
+ 0.5454545454545454,
551
+ -0.5,
552
+ -0.5,
553
+ -0.5
554
+ ]
555
+ },
556
+ {
557
+ "task": "hard_multi",
558
+ "seed": 104,
559
+ "policy": "llm",
560
+ "total_reward": 6.3636,
561
+ "grader_score": 0.6953,
562
+ "success_score": 0.8,
563
+ "budget_score": 0.0727,
564
+ "adaptation_score": 0.8583,
565
+ "latency_score": 0.6138,
566
+ "sla_score": 1.0,
567
+ "success_rate": 0.8889,
568
+ "steps": 20,
569
+ "actions": [
570
+ "route_to_a",
571
+ "route_to_a",
572
+ "route_to_b",
573
+ "route_to_b",
574
+ "route_to_b",
575
+ "route_to_b",
576
+ "route_to_b",
577
+ "route_to_b",
578
+ "route_to_b",
579
+ "route_to_b",
580
+ "route_to_b",
581
+ "route_to_b",
582
+ "route_to_b",
583
+ "route_to_b",
584
+ "route_to_c",
585
+ "route_to_c",
586
+ "route_to_c",
587
+ "route_to_c",
588
+ "shed_load",
589
+ "shed_load"
590
+ ],
591
+ "rewards": [
592
+ 0.9545454545454546,
593
+ -2.0454545454545454,
594
+ 0.7727272727272727,
595
+ 0.7727272727272727,
596
+ 0.7727272727272727,
597
+ 0.7727272727272727,
598
+ 0.7727272727272727,
599
+ 0.7727272727272727,
600
+ 0.7727272727272727,
601
+ 0.7727272727272727,
602
+ 0.7727272727272727,
603
+ 0.7727272727272727,
604
+ 0.7727272727272727,
605
+ -2.2272727272727275,
606
+ 0.5454545454545454,
607
+ 0.5454545454545454,
608
+ 0.5454545454545454,
609
+ 0.5454545454545454,
610
+ -0.5,
611
+ -0.5
612
+ ]
613
+ }
614
+ ]
615
+ }
eval/outputs/prompt_audit/belief_v1_heldout5/eval_summary_20260425_160016.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Budget Router Evaluation — 20260425_160016
2
+
3
+ | Task | HEURISTIC Grader | LLM Grader | Notes |
4
+ |---|---|---|---|
5
+ | Hard_Multi | 0.6175 (n=5) | 0.6297 (n=5) | LLM +1.2 points vs heuristic |
eval/outputs/prompt_audit/budget_guard_alltasks_dev3/eval_results_20260425_165910.json ADDED
@@ -0,0 +1,1468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "timestamp": "20260425_165910",
4
+ "policies": [
5
+ "heuristic",
6
+ "llm"
7
+ ],
8
+ "tasks": [
9
+ "easy",
10
+ "medium",
11
+ "hard",
12
+ "hard_multi"
13
+ ],
14
+ "seeds": [
15
+ 0,
16
+ 1,
17
+ 2
18
+ ]
19
+ },
20
+ "summary": {
21
+ "easy|heuristic": {
22
+ "grader_mean": 0.7734,
23
+ "reward_mean": 9.4667,
24
+ "success_rate": 0.8833,
25
+ "adaptation": 0.8833,
26
+ "n": 3
27
+ },
28
+ "medium|heuristic": {
29
+ "grader_mean": 0.6187,
30
+ "reward_mean": -1.5088,
31
+ "success_rate": 0.75,
32
+ "adaptation": 0.7333,
33
+ "n": 3
34
+ },
35
+ "hard|heuristic": {
36
+ "grader_mean": 0.5491,
37
+ "reward_mean": -4.7909,
38
+ "success_rate": 0.7593,
39
+ "adaptation": 0.7868,
40
+ "n": 3
41
+ },
42
+ "hard_multi|heuristic": {
43
+ "grader_mean": 0.5937,
44
+ "reward_mean": -3.0795,
45
+ "success_rate": 0.6947,
46
+ "adaptation": 0.6407,
47
+ "n": 3
48
+ },
49
+ "easy|llm": {
50
+ "grader_mean": 0.7044,
51
+ "reward_mean": 7.4833,
52
+ "success_rate": 0.8952,
53
+ "adaptation": 0.8167,
54
+ "n": 3
55
+ },
56
+ "medium|llm": {
57
+ "grader_mean": 0.6559,
58
+ "reward_mean": 1.1492,
59
+ "success_rate": 0.8061,
60
+ "adaptation": 0.8111,
61
+ "n": 3
62
+ },
63
+ "hard|llm": {
64
+ "grader_mean": 0.6196,
65
+ "reward_mean": -0.7321,
66
+ "success_rate": 0.8446,
67
+ "adaptation": 0.8796,
68
+ "n": 3
69
+ },
70
+ "hard_multi|llm": {
71
+ "grader_mean": 0.6989,
72
+ "reward_mean": 6.2879,
73
+ "success_rate": 0.8887,
74
+ "adaptation": 0.8675,
75
+ "n": 3
76
+ }
77
+ },
78
+ "episodes": [
79
+ {
80
+ "task": "easy",
81
+ "seed": 0,
82
+ "policy": "heuristic",
83
+ "total_reward": 12.2,
84
+ "grader_score": 0.7709,
85
+ "success_score": 0.95,
86
+ "budget_score": 0.04,
87
+ "adaptation_score": 0.95,
88
+ "latency_score": 0.6993,
89
+ "sla_score": 1.0,
90
+ "success_rate": 0.95,
91
+ "steps": 20,
92
+ "actions": [
93
+ "route_to_a",
94
+ "route_to_b",
95
+ "route_to_b",
96
+ "route_to_b",
97
+ "route_to_b",
98
+ "route_to_b",
99
+ "route_to_b",
100
+ "route_to_b",
101
+ "route_to_b",
102
+ "route_to_b",
103
+ "route_to_b",
104
+ "route_to_b",
105
+ "route_to_b",
106
+ "route_to_b",
107
+ "route_to_b",
108
+ "route_to_b",
109
+ "route_to_b",
110
+ "route_to_b",
111
+ "route_to_b",
112
+ "route_to_b"
113
+ ],
114
+ "rewards": [
115
+ -2.05,
116
+ 0.75,
117
+ 0.75,
118
+ 0.75,
119
+ 0.75,
120
+ 0.75,
121
+ 0.75,
122
+ 0.75,
123
+ 0.75,
124
+ 0.75,
125
+ 0.75,
126
+ 0.75,
127
+ 0.75,
128
+ 0.75,
129
+ 0.75,
130
+ 0.75,
131
+ 0.75,
132
+ 0.75,
133
+ 0.75,
134
+ 0.75
135
+ ]
136
+ },
137
+ {
138
+ "task": "easy",
139
+ "seed": 1,
140
+ "policy": "heuristic",
141
+ "total_reward": 10.0,
142
+ "grader_score": 0.8422,
143
+ "success_score": 0.85,
144
+ "budget_score": 0.8,
145
+ "adaptation_score": 0.85,
146
+ "latency_score": 0.7358,
147
+ "sla_score": 1.0,
148
+ "success_rate": 0.85,
149
+ "steps": 20,
150
+ "actions": [
151
+ "route_to_a",
152
+ "route_to_a",
153
+ "route_to_a",
154
+ "route_to_a",
155
+ "route_to_a",
156
+ "route_to_a",
157
+ "route_to_a",
158
+ "route_to_a",
159
+ "route_to_a",
160
+ "route_to_a",
161
+ "route_to_a",
162
+ "route_to_a",
163
+ "route_to_a",
164
+ "route_to_a",
165
+ "route_to_a",
166
+ "route_to_a",
167
+ "route_to_a",
168
+ "route_to_a",
169
+ "route_to_a",
170
+ "route_to_a"
171
+ ],
172
+ "rewards": [
173
+ 0.95,
174
+ 0.95,
175
+ 0.95,
176
+ -2.05,
177
+ 0.95,
178
+ 0.95,
179
+ -2.05,
180
+ 0.95,
181
+ 0.95,
182
+ 0.95,
183
+ 0.95,
184
+ 0.95,
185
+ 0.95,
186
+ 0.95,
187
+ 0.95,
188
+ 0.95,
189
+ 0.95,
190
+ 0.95,
191
+ -2.05,
192
+ 0.95
193
+ ]
194
+ },
195
+ {
196
+ "task": "easy",
197
+ "seed": 2,
198
+ "policy": "heuristic",
199
+ "total_reward": 6.2,
200
+ "grader_score": 0.7071,
201
+ "success_score": 0.85,
202
+ "budget_score": 0.04,
203
+ "adaptation_score": 0.85,
204
+ "latency_score": 0.6306,
205
+ "sla_score": 1.0,
206
+ "success_rate": 0.85,
207
+ "steps": 20,
208
+ "actions": [
209
+ "route_to_a",
210
+ "route_to_b",
211
+ "route_to_b",
212
+ "route_to_b",
213
+ "route_to_b",
214
+ "route_to_b",
215
+ "route_to_b",
216
+ "route_to_b",
217
+ "route_to_b",
218
+ "route_to_b",
219
+ "route_to_b",
220
+ "route_to_b",
221
+ "route_to_b",
222
+ "route_to_b",
223
+ "route_to_b",
224
+ "route_to_b",
225
+ "route_to_b",
226
+ "route_to_b",
227
+ "route_to_b",
228
+ "route_to_b"
229
+ ],
230
+ "rewards": [
231
+ -2.05,
232
+ 0.75,
233
+ 0.75,
234
+ 0.75,
235
+ 0.75,
236
+ 0.75,
237
+ 0.75,
238
+ -2.25,
239
+ -2.25,
240
+ 0.75,
241
+ 0.75,
242
+ 0.75,
243
+ 0.75,
244
+ 0.75,
245
+ 0.75,
246
+ 0.75,
247
+ 0.75,
248
+ 0.75,
249
+ 0.75,
250
+ 0.75
251
+ ]
252
+ },
253
+ {
254
+ "task": "medium",
255
+ "seed": 0,
256
+ "policy": "heuristic",
257
+ "total_reward": 1.2105,
258
+ "grader_score": 0.6776,
259
+ "success_score": 0.75,
260
+ "budget_score": 0.2421,
261
+ "adaptation_score": 0.7333,
262
+ "latency_score": 0.5979,
263
+ "sla_score": 1.0,
264
+ "success_rate": 0.75,
265
+ "steps": 20,
266
+ "actions": [
267
+ "route_to_a",
268
+ "route_to_a",
269
+ "route_to_a",
270
+ "route_to_a",
271
+ "route_to_a",
272
+ "route_to_a",
273
+ "route_to_a",
274
+ "route_to_b",
275
+ "route_to_b",
276
+ "route_to_b",
277
+ "route_to_b",
278
+ "route_to_b",
279
+ "route_to_b",
280
+ "route_to_b",
281
+ "route_to_b",
282
+ "route_to_b",
283
+ "route_to_b",
284
+ "route_to_b",
285
+ "route_to_b",
286
+ "route_to_b"
287
+ ],
288
+ "rewards": [
289
+ 0.9473684210526315,
290
+ 0.9473684210526315,
291
+ 0.9473684210526315,
292
+ 0.9473684210526315,
293
+ -2.0526315789473686,
294
+ -2.0526315789473686,
295
+ -2.0526315789473686,
296
+ 0.7368421052631579,
297
+ 0.7368421052631579,
298
+ 0.7368421052631579,
299
+ 0.7368421052631579,
300
+ 0.7368421052631579,
301
+ 0.7368421052631579,
302
+ 0.7368421052631579,
303
+ 0.7368421052631579,
304
+ 0.7368421052631579,
305
+ 0.7368421052631579,
306
+ -2.263157894736842,
307
+ 0.7368421052631579,
308
+ -2.263157894736842
309
+ ]
310
+ },
311
+ {
312
+ "task": "medium",
313
+ "seed": 1,
314
+ "policy": "heuristic",
315
+ "total_reward": -0.9474,
316
+ "grader_score": 0.6688,
317
+ "success_score": 0.7,
318
+ "budget_score": 0.4105,
319
+ "adaptation_score": 0.6667,
320
+ "latency_score": 0.5696,
321
+ "sla_score": 1.0,
322
+ "success_rate": 0.7,
323
+ "steps": 20,
324
+ "actions": [
325
+ "route_to_a",
326
+ "route_to_a",
327
+ "route_to_a",
328
+ "route_to_a",
329
+ "route_to_a",
330
+ "route_to_a",
331
+ "route_to_a",
332
+ "route_to_a",
333
+ "route_to_a",
334
+ "route_to_a",
335
+ "route_to_a",
336
+ "route_to_b",
337
+ "route_to_b",
338
+ "route_to_b",
339
+ "route_to_b",
340
+ "route_to_b",
341
+ "route_to_b",
342
+ "route_to_b",
343
+ "route_to_b",
344
+ "route_to_b"
345
+ ],
346
+ "rewards": [
347
+ 0.9473684210526315,
348
+ 0.9473684210526315,
349
+ 0.9473684210526315,
350
+ 0.9473684210526315,
351
+ -2.0526315789473686,
352
+ 0.9473684210526315,
353
+ 0.9473684210526315,
354
+ 0.9473684210526315,
355
+ -2.0526315789473686,
356
+ -2.0526315789473686,
357
+ -2.0526315789473686,
358
+ 0.7368421052631579,
359
+ 0.7368421052631579,
360
+ 0.7368421052631579,
361
+ -2.263157894736842,
362
+ 0.7368421052631579,
363
+ -2.263157894736842,
364
+ 0.7368421052631579,
365
+ 0.7368421052631579,
366
+ 0.7368421052631579
367
+ ]
368
+ },
369
+ {
370
+ "task": "medium",
371
+ "seed": 2,
372
+ "policy": "heuristic",
373
+ "total_reward": -4.7895,
374
+ "grader_score": 0.5097,
375
+ "success_score": 0.8,
376
+ "budget_score": 0.0,
377
+ "adaptation_score": 0.8,
378
+ "latency_score": 0.6483,
379
+ "sla_score": 1.0,
380
+ "success_rate": 0.8,
381
+ "steps": 20,
382
+ "actions": [
383
+ "route_to_a",
384
+ "route_to_b",
385
+ "route_to_b",
386
+ "route_to_b",
387
+ "route_to_b",
388
+ "route_to_b",
389
+ "route_to_b",
390
+ "route_to_b",
391
+ "route_to_b",
392
+ "route_to_b",
393
+ "route_to_b",
394
+ "route_to_b",
395
+ "route_to_b",
396
+ "route_to_b",
397
+ "route_to_b",
398
+ "route_to_b",
399
+ "route_to_b",
400
+ "route_to_b",
401
+ "route_to_b",
402
+ "route_to_b"
403
+ ],
404
+ "rewards": [
405
+ -2.0526315789473686,
406
+ 0.7368421052631579,
407
+ 0.7368421052631579,
408
+ 0.7368421052631579,
409
+ 0.7368421052631579,
410
+ 0.7368421052631579,
411
+ -2.263157894736842,
412
+ 0.7368421052631579,
413
+ 0.7368421052631579,
414
+ 0.7368421052631579,
415
+ 0.7368421052631579,
416
+ 0.7368421052631579,
417
+ 0.7368421052631579,
418
+ 0.7368421052631579,
419
+ 0.7368421052631579,
420
+ -2.263157894736842,
421
+ 0.7368421052631579,
422
+ 0.7368421052631579,
423
+ 0.7368421052631579,
424
+ -10.0
425
+ ]
426
+ },
427
+ {
428
+ "task": "hard",
429
+ "seed": 0,
430
+ "policy": "heuristic",
431
+ "total_reward": -7.8824,
432
+ "grader_score": 0.4999,
433
+ "success_score": 0.75,
434
+ "budget_score": 0.0,
435
+ "adaptation_score": 0.8235,
436
+ "latency_score": 0.6343,
437
+ "sla_score": 1.0,
438
+ "success_rate": 0.75,
439
+ "steps": 20,
440
+ "actions": [
441
+ "route_to_a",
442
+ "route_to_a",
443
+ "route_to_a",
444
+ "route_to_b",
445
+ "route_to_b",
446
+ "route_to_b",
447
+ "route_to_b",
448
+ "route_to_b",
449
+ "route_to_b",
450
+ "route_to_b",
451
+ "route_to_b",
452
+ "route_to_b",
453
+ "route_to_b",
454
+ "route_to_b",
455
+ "route_to_b",
456
+ "route_to_b",
457
+ "route_to_b",
458
+ "route_to_b",
459
+ "route_to_b",
460
+ "route_to_b"
461
+ ],
462
+ "rewards": [
463
+ 0.9411764705882353,
464
+ -2.0588235294117645,
465
+ -2.0588235294117645,
466
+ 0.7058823529411764,
467
+ 0.7058823529411764,
468
+ -2.2941176470588234,
469
+ 0.7058823529411764,
470
+ 0.7058823529411764,
471
+ 0.7058823529411764,
472
+ 0.7058823529411764,
473
+ 0.7058823529411764,
474
+ 0.7058823529411764,
475
+ 0.7058823529411764,
476
+ 0.7058823529411764,
477
+ 0.7058823529411764,
478
+ 0.7058823529411764,
479
+ 0.7058823529411764,
480
+ 0.7058823529411764,
481
+ -2.2941176470588234,
482
+ -10.0
483
+ ]
484
+ },
485
+ {
486
+ "task": "hard",
487
+ "seed": 1,
488
+ "policy": "heuristic",
489
+ "total_reward": 0.2941,
490
+ "grader_score": 0.6506,
491
+ "success_score": 0.75,
492
+ "budget_score": 0.0588,
493
+ "adaptation_score": 0.7368,
494
+ "latency_score": 0.5973,
495
+ "sla_score": 1.0,
496
+ "success_rate": 0.75,
497
+ "steps": 20,
498
+ "actions": [
499
+ "route_to_a",
500
+ "route_to_a",
501
+ "route_to_a",
502
+ "route_to_a",
503
+ "route_to_a",
504
+ "route_to_b",
505
+ "route_to_b",
506
+ "route_to_b",
507
+ "route_to_b",
508
+ "route_to_b",
509
+ "route_to_b",
510
+ "route_to_b",
511
+ "route_to_b",
512
+ "route_to_b",
513
+ "route_to_b",
514
+ "route_to_b",
515
+ "route_to_b",
516
+ "route_to_b",
517
+ "route_to_b",
518
+ "route_to_b"
519
+ ],
520
+ "rewards": [
521
+ 0.9411764705882353,
522
+ 0.9411764705882353,
523
+ -2.0588235294117645,
524
+ -2.0588235294117645,
525
+ -2.0588235294117645,
526
+ 0.7058823529411764,
527
+ 0.7058823529411764,
528
+ 0.7058823529411764,
529
+ 0.7058823529411764,
530
+ 0.7058823529411764,
531
+ 0.7058823529411764,
532
+ 0.7058823529411764,
533
+ 0.7058823529411764,
534
+ 0.7058823529411764,
535
+ -2.2941176470588234,
536
+ 0.7058823529411764,
537
+ 0.7058823529411764,
538
+ -2.2941176470588234,
539
+ 0.7058823529411764,
540
+ 0.7058823529411764
541
+ ]
542
+ },
543
+ {
544
+ "task": "hard",
545
+ "seed": 2,
546
+ "policy": "heuristic",
547
+ "total_reward": -6.7845,
548
+ "grader_score": 0.4969,
549
+ "success_score": 0.7778,
550
+ "budget_score": 0.0,
551
+ "adaptation_score": 0.8,
552
+ "latency_score": 0.6374,
553
+ "sla_score": 0.9444,
554
+ "success_rate": 0.7778,
555
+ "steps": 18,
556
+ "actions": [
557
+ "route_to_a",
558
+ "route_to_b",
559
+ "route_to_b",
560
+ "route_to_b",
561
+ "route_to_b",
562
+ "route_to_b",
563
+ "route_to_b",
564
+ "route_to_b",
565
+ "route_to_b",
566
+ "route_to_b",
567
+ "route_to_b",
568
+ "route_to_b",
569
+ "route_to_b",
570
+ "route_to_b",
571
+ "route_to_b",
572
+ "route_to_b",
573
+ "route_to_b",
574
+ "route_to_b"
575
+ ],
576
+ "rewards": [
577
+ -2.0588235294117645,
578
+ 0.7058823529411764,
579
+ 0.7058823529411764,
580
+ 0.7058823529411764,
581
+ 0.7058823529411764,
582
+ -2.3138982657484135,
583
+ 0.7058823529411764,
584
+ 0.7058823529411764,
585
+ 0.7058823529411764,
586
+ 0.7058823529411764,
587
+ 0.7058823529411764,
588
+ 0.7058823529411764,
589
+ 0.7058823529411764,
590
+ 0.7058823529411764,
591
+ -2.2941176470588234,
592
+ 0.7058823529411764,
593
+ 0.7058823529411764,
594
+ -10.0
595
+ ]
596
+ },
597
+ {
598
+ "task": "hard_multi",
599
+ "seed": 0,
600
+ "policy": "heuristic",
601
+ "total_reward": -4.4659,
602
+ "grader_score": 0.5569,
603
+ "success_score": 0.65,
604
+ "budget_score": 0.0364,
605
+ "adaptation_score": 0.6032,
606
+ "latency_score": 0.4686,
607
+ "sla_score": 0.9474,
608
+ "success_rate": 0.6842,
609
+ "steps": 20,
610
+ "actions": [
611
+ "route_to_a",
612
+ "route_to_a",
613
+ "route_to_a",
614
+ "route_to_a",
615
+ "route_to_a",
616
+ "route_to_a",
617
+ "route_to_b",
618
+ "route_to_b",
619
+ "route_to_b",
620
+ "route_to_b",
621
+ "route_to_b",
622
+ "route_to_b",
623
+ "route_to_c",
624
+ "route_to_c",
625
+ "route_to_c",
626
+ "route_to_c",
627
+ "route_to_c",
628
+ "route_to_c",
629
+ "route_to_c",
630
+ "shed_load"
631
+ ],
632
+ "rewards": [
633
+ 0.9545454545454546,
634
+ 0.9545454545454546,
635
+ 0.9545454545454546,
636
+ -2.0454545454545454,
637
+ -2.0454545454545454,
638
+ -2.0454545454545454,
639
+ 0.7727272727272727,
640
+ 0.7727272727272727,
641
+ 0.7727272727272727,
642
+ -2.2272727272727275,
643
+ -2.2272727272727275,
644
+ -2.3750364951788474,
645
+ 0.5454545454545454,
646
+ 0.5454545454545454,
647
+ 0.5454545454545454,
648
+ 0.5454545454545454,
649
+ 0.5454545454545454,
650
+ 0.5454545454545454,
651
+ 0.5454545454545454,
652
+ -0.5
653
+ ]
654
+ },
655
+ {
656
+ "task": "hard_multi",
657
+ "seed": 1,
658
+ "policy": "heuristic",
659
+ "total_reward": -2.7727,
660
+ "grader_score": 0.6077,
661
+ "success_score": 0.7,
662
+ "budget_score": 0.0455,
663
+ "adaptation_score": 0.6833,
664
+ "latency_score": 0.5213,
665
+ "sla_score": 1.0,
666
+ "success_rate": 0.7,
667
+ "steps": 20,
668
+ "actions": [
669
+ "route_to_a",
670
+ "route_to_a",
671
+ "route_to_a",
672
+ "route_to_a",
673
+ "route_to_a",
674
+ "route_to_b",
675
+ "route_to_b",
676
+ "route_to_b",
677
+ "route_to_b",
678
+ "route_to_b",
679
+ "route_to_b",
680
+ "route_to_b",
681
+ "route_to_b",
682
+ "route_to_b",
683
+ "route_to_b",
684
+ "route_to_c",
685
+ "route_to_c",
686
+ "route_to_c",
687
+ "route_to_c",
688
+ "route_to_c"
689
+ ],
690
+ "rewards": [
691
+ 0.9545454545454546,
692
+ -2.0454545454545454,
693
+ 0.9545454545454546,
694
+ -2.0454545454545454,
695
+ -2.0454545454545454,
696
+ 0.7727272727272727,
697
+ 0.7727272727272727,
698
+ 0.7727272727272727,
699
+ 0.7727272727272727,
700
+ 0.7727272727272727,
701
+ 0.7727272727272727,
702
+ 0.7727272727272727,
703
+ -2.2272727272727275,
704
+ -2.2272727272727275,
705
+ -2.2272727272727275,
706
+ 0.5454545454545454,
707
+ 0.5454545454545454,
708
+ 0.5454545454545454,
709
+ 0.5454545454545454,
710
+ 0.5454545454545454
711
+ ]
712
+ },
713
+ {
714
+ "task": "hard_multi",
715
+ "seed": 2,
716
+ "policy": "heuristic",
717
+ "total_reward": -2.0,
718
+ "grader_score": 0.6165,
719
+ "success_score": 0.7,
720
+ "budget_score": 0.2,
721
+ "adaptation_score": 0.6357,
722
+ "latency_score": 0.4967,
723
+ "sla_score": 1.0,
724
+ "success_rate": 0.7,
725
+ "steps": 20,
726
+ "actions": [
727
+ "route_to_a",
728
+ "route_to_a",
729
+ "route_to_a",
730
+ "route_to_a",
731
+ "route_to_a",
732
+ "route_to_a",
733
+ "route_to_a",
734
+ "route_to_a",
735
+ "route_to_b",
736
+ "route_to_b",
737
+ "route_to_b",
738
+ "route_to_b",
739
+ "route_to_b",
740
+ "route_to_b",
741
+ "route_to_b",
742
+ "route_to_b",
743
+ "route_to_c",
744
+ "route_to_c",
745
+ "route_to_c",
746
+ "route_to_c"
747
+ ],
748
+ "rewards": [
749
+ 0.9545454545454546,
750
+ 0.9545454545454546,
751
+ 0.9545454545454546,
752
+ 0.9545454545454546,
753
+ -2.0454545454545454,
754
+ -2.0454545454545454,
755
+ 0.9545454545454546,
756
+ -2.0454545454545454,
757
+ 0.7727272727272727,
758
+ 0.7727272727272727,
759
+ 0.7727272727272727,
760
+ 0.7727272727272727,
761
+ 0.7727272727272727,
762
+ -2.2272727272727275,
763
+ -2.2272727272727275,
764
+ -2.2272727272727275,
765
+ 0.5454545454545454,
766
+ 0.5454545454545454,
767
+ 0.5454545454545454,
768
+ 0.5454545454545454
769
+ ]
770
+ },
771
+ {
772
+ "task": "easy",
773
+ "seed": 0,
774
+ "policy": "llm",
775
+ "total_reward": 12.2,
776
+ "grader_score": 0.7709,
777
+ "success_score": 0.95,
778
+ "budget_score": 0.04,
779
+ "adaptation_score": 0.95,
780
+ "latency_score": 0.6993,
781
+ "sla_score": 1.0,
782
+ "success_rate": 0.95,
783
+ "steps": 20,
784
+ "actions": [
785
+ "route_to_a",
786
+ "route_to_b",
787
+ "route_to_b",
788
+ "route_to_b",
789
+ "route_to_b",
790
+ "route_to_b",
791
+ "route_to_b",
792
+ "route_to_b",
793
+ "route_to_b",
794
+ "route_to_b",
795
+ "route_to_b",
796
+ "route_to_b",
797
+ "route_to_b",
798
+ "route_to_b",
799
+ "route_to_b",
800
+ "route_to_b",
801
+ "route_to_b",
802
+ "route_to_b",
803
+ "route_to_b",
804
+ "route_to_b"
805
+ ],
806
+ "rewards": [
807
+ -2.05,
808
+ 0.75,
809
+ 0.75,
810
+ 0.75,
811
+ 0.75,
812
+ 0.75,
813
+ 0.75,
814
+ 0.75,
815
+ 0.75,
816
+ 0.75,
817
+ 0.75,
818
+ 0.75,
819
+ 0.75,
820
+ 0.75,
821
+ 0.75,
822
+ 0.75,
823
+ 0.75,
824
+ 0.75,
825
+ 0.75,
826
+ 0.75
827
+ ]
828
+ },
829
+ {
830
+ "task": "easy",
831
+ "seed": 1,
832
+ "policy": "llm",
833
+ "total_reward": 12.8,
834
+ "grader_score": 0.7902,
835
+ "success_score": 0.95,
836
+ "budget_score": 0.16,
837
+ "adaptation_score": 0.95,
838
+ "latency_score": 0.7058,
839
+ "sla_score": 1.0,
840
+ "success_rate": 0.95,
841
+ "steps": 20,
842
+ "actions": [
843
+ "route_to_a",
844
+ "route_to_a",
845
+ "route_to_a",
846
+ "route_to_a",
847
+ "route_to_b",
848
+ "route_to_b",
849
+ "route_to_b",
850
+ "route_to_b",
851
+ "route_to_b",
852
+ "route_to_b",
853
+ "route_to_b",
854
+ "route_to_b",
855
+ "route_to_b",
856
+ "route_to_b",
857
+ "route_to_b",
858
+ "route_to_b",
859
+ "route_to_b",
860
+ "route_to_b",
861
+ "route_to_b",
862
+ "route_to_b"
863
+ ],
864
+ "rewards": [
865
+ 0.95,
866
+ 0.95,
867
+ 0.95,
868
+ -2.05,
869
+ 0.75,
870
+ 0.75,
871
+ 0.75,
872
+ 0.75,
873
+ 0.75,
874
+ 0.75,
875
+ 0.75,
876
+ 0.75,
877
+ 0.75,
878
+ 0.75,
879
+ 0.75,
880
+ 0.75,
881
+ 0.75,
882
+ 0.75,
883
+ 0.75,
884
+ 0.75
885
+ ]
886
+ },
887
+ {
888
+ "task": "easy",
889
+ "seed": 2,
890
+ "policy": "llm",
891
+ "total_reward": -2.55,
892
+ "grader_score": 0.552,
893
+ "success_score": 0.55,
894
+ "budget_score": 0.09,
895
+ "adaptation_score": 0.55,
896
+ "latency_score": 0.5677,
897
+ "sla_score": 1.0,
898
+ "success_rate": 0.7857,
899
+ "steps": 20,
900
+ "actions": [
901
+ "route_to_a",
902
+ "route_to_b",
903
+ "route_to_b",
904
+ "route_to_b",
905
+ "route_to_b",
906
+ "route_to_b",
907
+ "route_to_b",
908
+ "route_to_b",
909
+ "route_to_b",
910
+ "route_to_c",
911
+ "route_to_c",
912
+ "route_to_c",
913
+ "route_to_c",
914
+ "route_to_c",
915
+ "shed_load",
916
+ "shed_load",
917
+ "shed_load",
918
+ "shed_load",
919
+ "shed_load",
920
+ "shed_load"
921
+ ],
922
+ "rewards": [
923
+ -2.05,
924
+ 0.75,
925
+ 0.75,
926
+ 0.75,
927
+ 0.75,
928
+ 0.75,
929
+ 0.75,
930
+ -2.25,
931
+ -2.25,
932
+ 0.5,
933
+ 0.5,
934
+ 0.5,
935
+ 0.5,
936
+ 0.5,
937
+ -0.5,
938
+ -0.5,
939
+ -0.5,
940
+ -0.5,
941
+ -0.5,
942
+ -0.5
943
+ ]
944
+ },
945
+ {
946
+ "task": "medium",
947
+ "seed": 0,
948
+ "policy": "llm",
949
+ "total_reward": 9.2632,
950
+ "grader_score": 0.7476,
951
+ "success_score": 0.9,
952
+ "budget_score": 0.0526,
953
+ "adaptation_score": 0.9333,
954
+ "latency_score": 0.6651,
955
+ "sla_score": 1.0,
956
+ "success_rate": 0.9,
957
+ "steps": 20,
958
+ "actions": [
959
+ "route_to_a",
960
+ "route_to_a",
961
+ "route_to_a",
962
+ "route_to_a",
963
+ "route_to_a",
964
+ "route_to_b",
965
+ "route_to_b",
966
+ "route_to_b",
967
+ "route_to_b",
968
+ "route_to_b",
969
+ "route_to_b",
970
+ "route_to_b",
971
+ "route_to_b",
972
+ "route_to_b",
973
+ "route_to_b",
974
+ "route_to_b",
975
+ "route_to_b",
976
+ "route_to_b",
977
+ "route_to_c",
978
+ "route_to_c"
979
+ ],
980
+ "rewards": [
981
+ 0.9473684210526315,
982
+ 0.9473684210526315,
983
+ 0.9473684210526315,
984
+ 0.9473684210526315,
985
+ -2.0526315789473686,
986
+ 0.7368421052631579,
987
+ 0.7368421052631579,
988
+ 0.7368421052631579,
989
+ 0.7368421052631579,
990
+ 0.7368421052631579,
991
+ 0.7368421052631579,
992
+ 0.7368421052631579,
993
+ 0.7368421052631579,
994
+ 0.7368421052631579,
995
+ 0.7368421052631579,
996
+ 0.7368421052631579,
997
+ 0.7368421052631579,
998
+ -2.263157894736842,
999
+ 0.4736842105263157,
1000
+ 0.4736842105263157
1001
+ ]
1002
+ },
1003
+ {
1004
+ "task": "medium",
1005
+ "seed": 1,
1006
+ "policy": "llm",
1007
+ "total_reward": -2.5789,
1008
+ "grader_score": 0.6148,
1009
+ "success_score": 0.7,
1010
+ "budget_score": 0.0842,
1011
+ "adaptation_score": 0.6667,
1012
+ "latency_score": 0.5439,
1013
+ "sla_score": 1.0,
1014
+ "success_rate": 0.7,
1015
+ "steps": 20,
1016
+ "actions": [
1017
+ "route_to_a",
1018
+ "route_to_a",
1019
+ "route_to_a",
1020
+ "route_to_a",
1021
+ "route_to_a",
1022
+ "route_to_b",
1023
+ "route_to_b",
1024
+ "route_to_b",
1025
+ "route_to_b",
1026
+ "route_to_a",
1027
+ "route_to_b",
1028
+ "route_to_b",
1029
+ "route_to_b",
1030
+ "route_to_b",
1031
+ "route_to_b",
1032
+ "route_to_a",
1033
+ "route_to_b",
1034
+ "route_to_c",
1035
+ "route_to_c",
1036
+ "route_to_c"
1037
+ ],
1038
+ "rewards": [
1039
+ 0.9473684210526315,
1040
+ 0.9473684210526315,
1041
+ 0.9473684210526315,
1042
+ 0.9473684210526315,
1043
+ -2.0526315789473686,
1044
+ 0.7368421052631579,
1045
+ 0.7368421052631579,
1046
+ 0.7368421052631579,
1047
+ -2.263157894736842,
1048
+ -2.0526315789473686,
1049
+ 0.7368421052631579,
1050
+ 0.7368421052631579,
1051
+ 0.7368421052631579,
1052
+ 0.7368421052631579,
1053
+ -2.263157894736842,
1054
+ -2.0526315789473686,
1055
+ -2.263157894736842,
1056
+ 0.4736842105263157,
1057
+ 0.4736842105263157,
1058
+ 0.4736842105263157
1059
+ ]
1060
+ },
1061
+ {
1062
+ "task": "medium",
1063
+ "seed": 2,
1064
+ "policy": "llm",
1065
+ "total_reward": -3.2368,
1066
+ "grader_score": 0.6052,
1067
+ "success_score": 0.45,
1068
+ "budget_score": 0.2526,
1069
+ "adaptation_score": 0.8333,
1070
+ "latency_score": 0.5783,
1071
+ "sla_score": 1.0,
1072
+ "success_rate": 0.8182,
1073
+ "steps": 20,
1074
+ "actions": [
1075
+ "route_to_a",
1076
+ "route_to_b",
1077
+ "route_to_b",
1078
+ "route_to_b",
1079
+ "route_to_b",
1080
+ "route_to_b",
1081
+ "route_to_b",
1082
+ "route_to_c",
1083
+ "route_to_c",
1084
+ "route_to_c",
1085
+ "route_to_c",
1086
+ "shed_load",
1087
+ "shed_load",
1088
+ "shed_load",
1089
+ "shed_load",
1090
+ "shed_load",
1091
+ "shed_load",
1092
+ "shed_load",
1093
+ "shed_load",
1094
+ "shed_load"
1095
+ ],
1096
+ "rewards": [
1097
+ -2.0526315789473686,
1098
+ 0.7368421052631579,
1099
+ 0.7368421052631579,
1100
+ 0.7368421052631579,
1101
+ 0.7368421052631579,
1102
+ 0.7368421052631579,
1103
+ -2.263157894736842,
1104
+ 0.4736842105263157,
1105
+ 0.4736842105263157,
1106
+ 0.4736842105263157,
1107
+ 0.4736842105263157,
1108
+ -0.5,
1109
+ -0.5,
1110
+ -0.5,
1111
+ -0.5,
1112
+ -0.5,
1113
+ -0.5,
1114
+ -0.5,
1115
+ -0.5,
1116
+ -0.5
1117
+ ]
1118
+ },
1119
+ {
1120
+ "task": "hard",
1121
+ "seed": 0,
1122
+ "policy": "llm",
1123
+ "total_reward": -2.8235,
1124
+ "grader_score": 0.5954,
1125
+ "success_score": 0.5,
1126
+ "budget_score": 0.0353,
1127
+ "adaptation_score": 0.8889,
1128
+ "latency_score": 0.5615,
1129
+ "sla_score": 1.0,
1130
+ "success_rate": 0.8333,
1131
+ "steps": 20,
1132
+ "actions": [
1133
+ "route_to_a",
1134
+ "route_to_a",
1135
+ "route_to_b",
1136
+ "route_to_b",
1137
+ "route_to_b",
1138
+ "route_to_b",
1139
+ "route_to_c",
1140
+ "route_to_c",
1141
+ "route_to_c",
1142
+ "route_to_c",
1143
+ "route_to_c",
1144
+ "shed_load",
1145
+ "route_to_c",
1146
+ "shed_load",
1147
+ "shed_load",
1148
+ "shed_load",
1149
+ "shed_load",
1150
+ "shed_load",
1151
+ "shed_load",
1152
+ "shed_load"
1153
+ ],
1154
+ "rewards": [
1155
+ 0.9411764705882353,
1156
+ -2.0588235294117645,
1157
+ 0.7058823529411764,
1158
+ 0.7058823529411764,
1159
+ 0.7058823529411764,
1160
+ -2.2941176470588234,
1161
+ 0.4117647058823529,
1162
+ 0.4117647058823529,
1163
+ 0.4117647058823529,
1164
+ 0.4117647058823529,
1165
+ 0.4117647058823529,
1166
+ -0.5,
1167
+ 0.4117647058823529,
1168
+ -0.5,
1169
+ -0.5,
1170
+ -0.5,
1171
+ -0.5,
1172
+ -0.5,
1173
+ -0.5,
1174
+ -0.5
1175
+ ]
1176
+ },
1177
+ {
1178
+ "task": "hard",
1179
+ "seed": 1,
1180
+ "policy": "llm",
1181
+ "total_reward": 4.6176,
1182
+ "grader_score": 0.69,
1183
+ "success_score": 0.75,
1184
+ "budget_score": 0.0235,
1185
+ "adaptation_score": 0.875,
1186
+ "latency_score": 0.6823,
1187
+ "sla_score": 1.0,
1188
+ "success_rate": 0.8824,
1189
+ "steps": 20,
1190
+ "actions": [
1191
+ "route_to_a",
1192
+ "route_to_a",
1193
+ "route_to_a",
1194
+ "route_to_b",
1195
+ "route_to_b",
1196
+ "route_to_b",
1197
+ "route_to_b",
1198
+ "route_to_b",
1199
+ "route_to_b",
1200
+ "route_to_b",
1201
+ "route_to_b",
1202
+ "route_to_b",
1203
+ "route_to_b",
1204
+ "route_to_b",
1205
+ "route_to_b",
1206
+ "route_to_c",
1207
+ "route_to_c",
1208
+ "shed_load",
1209
+ "shed_load",
1210
+ "shed_load"
1211
+ ],
1212
+ "rewards": [
1213
+ 0.9411764705882353,
1214
+ 0.9411764705882353,
1215
+ -2.0588235294117645,
1216
+ 0.7058823529411764,
1217
+ 0.7058823529411764,
1218
+ 0.7058823529411764,
1219
+ 0.7058823529411764,
1220
+ 0.7058823529411764,
1221
+ 0.7058823529411764,
1222
+ 0.7058823529411764,
1223
+ 0.7058823529411764,
1224
+ 0.7058823529411764,
1225
+ 0.7058823529411764,
1226
+ 0.7058823529411764,
1227
+ -2.2941176470588234,
1228
+ 0.4117647058823529,
1229
+ 0.4117647058823529,
1230
+ -0.5,
1231
+ -0.5,
1232
+ -0.5
1233
+ ]
1234
+ },
1235
+ {
1236
+ "task": "hard",
1237
+ "seed": 2,
1238
+ "policy": "llm",
1239
+ "total_reward": -3.9904,
1240
+ "grader_score": 0.5735,
1241
+ "success_score": 0.45,
1242
+ "budget_score": 0.1059,
1243
+ "adaptation_score": 0.875,
1244
+ "latency_score": 0.556,
1245
+ "sla_score": 0.9091,
1246
+ "success_rate": 0.8182,
1247
+ "steps": 20,
1248
+ "actions": [
1249
+ "route_to_a",
1250
+ "route_to_b",
1251
+ "route_to_b",
1252
+ "route_to_b",
1253
+ "route_to_b",
1254
+ "route_to_b",
1255
+ "route_to_c",
1256
+ "route_to_c",
1257
+ "route_to_c",
1258
+ "route_to_c",
1259
+ "route_to_c",
1260
+ "shed_load",
1261
+ "shed_load",
1262
+ "shed_load",
1263
+ "shed_load",
1264
+ "shed_load",
1265
+ "shed_load",
1266
+ "shed_load",
1267
+ "shed_load",
1268
+ "shed_load"
1269
+ ],
1270
+ "rewards": [
1271
+ -2.0588235294117645,
1272
+ 0.7058823529411764,
1273
+ 0.7058823529411764,
1274
+ 0.7058823529411764,
1275
+ 0.7058823529411764,
1276
+ -2.3138982657484135,
1277
+ 0.4117647058823529,
1278
+ 0.4117647058823529,
1279
+ 0.4117647058823529,
1280
+ 0.4117647058823529,
1281
+ 0.4117647058823529,
1282
+ -0.5,
1283
+ -0.5,
1284
+ -0.5,
1285
+ -0.5,
1286
+ -0.5,
1287
+ -0.5,
1288
+ -0.5,
1289
+ -0.5,
1290
+ -0.5
1291
+ ]
1292
+ },
1293
+ {
1294
+ "task": "hard_multi",
1295
+ "seed": 0,
1296
+ "policy": "llm",
1297
+ "total_reward": 4.7727,
1298
+ "grader_score": 0.6818,
1299
+ "success_score": 0.75,
1300
+ "budget_score": 0.0545,
1301
+ "adaptation_score": 0.8571,
1302
+ "latency_score": 0.6358,
1303
+ "sla_score": 1.0,
1304
+ "success_rate": 0.8824,
1305
+ "steps": 20,
1306
+ "actions": [
1307
+ "route_to_a",
1308
+ "route_to_a",
1309
+ "route_to_a",
1310
+ "route_to_a",
1311
+ "route_to_b",
1312
+ "route_to_b",
1313
+ "route_to_b",
1314
+ "route_to_b",
1315
+ "route_to_b",
1316
+ "route_to_b",
1317
+ "route_to_c",
1318
+ "route_to_c",
1319
+ "route_to_c",
1320
+ "route_to_c",
1321
+ "route_to_c",
1322
+ "route_to_c",
1323
+ "route_to_c",
1324
+ "shed_load",
1325
+ "shed_load",
1326
+ "shed_load"
1327
+ ],
1328
+ "rewards": [
1329
+ 0.9545454545454546,
1330
+ 0.9545454545454546,
1331
+ 0.9545454545454546,
1332
+ -2.0454545454545454,
1333
+ 0.7727272727272727,
1334
+ 0.7727272727272727,
1335
+ 0.7727272727272727,
1336
+ 0.7727272727272727,
1337
+ 0.7727272727272727,
1338
+ -2.2272727272727275,
1339
+ 0.5454545454545454,
1340
+ 0.5454545454545454,
1341
+ 0.5454545454545454,
1342
+ 0.5454545454545454,
1343
+ 0.5454545454545454,
1344
+ 0.5454545454545454,
1345
+ 0.5454545454545454,
1346
+ -0.5,
1347
+ -0.5,
1348
+ -0.5
1349
+ ]
1350
+ },
1351
+ {
1352
+ "task": "hard_multi",
1353
+ "seed": 1,
1354
+ "policy": "llm",
1355
+ "total_reward": 6.1364,
1356
+ "grader_score": 0.6994,
1357
+ "success_score": 0.8,
1358
+ "budget_score": 0.0273,
1359
+ "adaptation_score": 0.8786,
1360
+ "latency_score": 0.648,
1361
+ "sla_score": 1.0,
1362
+ "success_rate": 0.8889,
1363
+ "steps": 20,
1364
+ "actions": [
1365
+ "route_to_a",
1366
+ "route_to_a",
1367
+ "route_to_b",
1368
+ "route_to_b",
1369
+ "route_to_b",
1370
+ "route_to_b",
1371
+ "route_to_b",
1372
+ "route_to_b",
1373
+ "route_to_b",
1374
+ "route_to_b",
1375
+ "route_to_b",
1376
+ "route_to_b",
1377
+ "route_to_b",
1378
+ "route_to_c",
1379
+ "route_to_c",
1380
+ "route_to_c",
1381
+ "route_to_c",
1382
+ "route_to_c",
1383
+ "shed_load",
1384
+ "shed_load"
1385
+ ],
1386
+ "rewards": [
1387
+ 0.9545454545454546,
1388
+ -2.0454545454545454,
1389
+ 0.7727272727272727,
1390
+ 0.7727272727272727,
1391
+ 0.7727272727272727,
1392
+ 0.7727272727272727,
1393
+ 0.7727272727272727,
1394
+ 0.7727272727272727,
1395
+ 0.7727272727272727,
1396
+ 0.7727272727272727,
1397
+ 0.7727272727272727,
1398
+ 0.7727272727272727,
1399
+ -2.2272727272727275,
1400
+ 0.5454545454545454,
1401
+ 0.5454545454545454,
1402
+ 0.5454545454545454,
1403
+ 0.5454545454545454,
1404
+ 0.5454545454545454,
1405
+ -0.5,
1406
+ -0.5
1407
+ ]
1408
+ },
1409
+ {
1410
+ "task": "hard_multi",
1411
+ "seed": 2,
1412
+ "policy": "llm",
1413
+ "total_reward": 7.9545,
1414
+ "grader_score": 0.7156,
1415
+ "success_score": 0.85,
1416
+ "budget_score": 0.0909,
1417
+ "adaptation_score": 0.8667,
1418
+ "latency_score": 0.6181,
1419
+ "sla_score": 1.0,
1420
+ "success_rate": 0.8947,
1421
+ "steps": 20,
1422
+ "actions": [
1423
+ "route_to_a",
1424
+ "route_to_a",
1425
+ "route_to_a",
1426
+ "route_to_a",
1427
+ "route_to_a",
1428
+ "route_to_b",
1429
+ "route_to_b",
1430
+ "route_to_b",
1431
+ "route_to_b",
1432
+ "route_to_b",
1433
+ "route_to_b",
1434
+ "route_to_b",
1435
+ "route_to_b",
1436
+ "route_to_b",
1437
+ "route_to_c",
1438
+ "route_to_c",
1439
+ "route_to_c",
1440
+ "route_to_c",
1441
+ "route_to_c",
1442
+ "shed_load"
1443
+ ],
1444
+ "rewards": [
1445
+ 0.9545454545454546,
1446
+ 0.9545454545454546,
1447
+ 0.9545454545454546,
1448
+ 0.9545454545454546,
1449
+ -2.0454545454545454,
1450
+ 0.7727272727272727,
1451
+ 0.7727272727272727,
1452
+ 0.7727272727272727,
1453
+ 0.7727272727272727,
1454
+ 0.7727272727272727,
1455
+ 0.7727272727272727,
1456
+ 0.7727272727272727,
1457
+ 0.7727272727272727,
1458
+ -2.2272727272727275,
1459
+ 0.5454545454545454,
1460
+ 0.5454545454545454,
1461
+ 0.5454545454545454,
1462
+ 0.5454545454545454,
1463
+ 0.5454545454545454,
1464
+ -0.5
1465
+ ]
1466
+ }
1467
+ ]
1468
+ }
eval/outputs/prompt_audit/budget_guard_alltasks_dev3/eval_summary_20260425_165910.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Budget Router Evaluation — 20260425_165910
2
+
3
+ | Task | HEURISTIC Grader | LLM Grader | Notes |
4
+ |---|---|---|---|
5
+ | Easy | 0.7734 (n=3) | 0.7044 (n=3) | |
6
+ | Medium | 0.6187 (n=3) | 0.6559 (n=3) | |
7
+ | Hard | 0.5491 (n=3) | 0.6196 (n=3) | |
8
+ | Hard_Multi | 0.5937 (n=3) | 0.6989 (n=3) | LLM +10.5 points vs heuristic |
eval/outputs/prompt_audit/budget_guard_dev10/eval_results_20260425_164343.json ADDED
@@ -0,0 +1,1202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "timestamp": "20260425_164343",
4
+ "policies": [
5
+ "heuristic",
6
+ "llm"
7
+ ],
8
+ "tasks": [
9
+ "hard_multi"
10
+ ],
11
+ "seeds": [
12
+ 0,
13
+ 1,
14
+ 2,
15
+ 3,
16
+ 4,
17
+ 5,
18
+ 6,
19
+ 7,
20
+ 8,
21
+ 9
22
+ ]
23
+ },
24
+ "summary": {
25
+ "hard_multi|heuristic": {
26
+ "grader_mean": 0.6078,
27
+ "reward_mean": -2.9709,
28
+ "success_rate": 0.6998,
29
+ "adaptation": 0.6907,
30
+ "n": 10
31
+ },
32
+ "hard_multi|llm": {
33
+ "grader_mean": 0.6888,
34
+ "reward_mean": 4.7955,
35
+ "success_rate": 0.8722,
36
+ "adaptation": 0.8882,
37
+ "n": 10
38
+ }
39
+ },
40
+ "episodes": [
41
+ {
42
+ "task": "hard_multi",
43
+ "seed": 0,
44
+ "policy": "heuristic",
45
+ "total_reward": -4.4659,
46
+ "grader_score": 0.5569,
47
+ "success_score": 0.65,
48
+ "budget_score": 0.0364,
49
+ "adaptation_score": 0.6032,
50
+ "latency_score": 0.4686,
51
+ "sla_score": 0.9474,
52
+ "success_rate": 0.6842,
53
+ "steps": 20,
54
+ "actions": [
55
+ "route_to_a",
56
+ "route_to_a",
57
+ "route_to_a",
58
+ "route_to_a",
59
+ "route_to_a",
60
+ "route_to_a",
61
+ "route_to_b",
62
+ "route_to_b",
63
+ "route_to_b",
64
+ "route_to_b",
65
+ "route_to_b",
66
+ "route_to_b",
67
+ "route_to_c",
68
+ "route_to_c",
69
+ "route_to_c",
70
+ "route_to_c",
71
+ "route_to_c",
72
+ "route_to_c",
73
+ "route_to_c",
74
+ "shed_load"
75
+ ],
76
+ "rewards": [
77
+ 0.9545454545454546,
78
+ 0.9545454545454546,
79
+ 0.9545454545454546,
80
+ -2.0454545454545454,
81
+ -2.0454545454545454,
82
+ -2.0454545454545454,
83
+ 0.7727272727272727,
84
+ 0.7727272727272727,
85
+ 0.7727272727272727,
86
+ -2.2272727272727275,
87
+ -2.2272727272727275,
88
+ -2.3750364951788474,
89
+ 0.5454545454545454,
90
+ 0.5454545454545454,
91
+ 0.5454545454545454,
92
+ 0.5454545454545454,
93
+ 0.5454545454545454,
94
+ 0.5454545454545454,
95
+ 0.5454545454545454,
96
+ -0.5
97
+ ]
98
+ },
99
+ {
100
+ "task": "hard_multi",
101
+ "seed": 1,
102
+ "policy": "heuristic",
103
+ "total_reward": -2.7727,
104
+ "grader_score": 0.6077,
105
+ "success_score": 0.7,
106
+ "budget_score": 0.0455,
107
+ "adaptation_score": 0.6833,
108
+ "latency_score": 0.5213,
109
+ "sla_score": 1.0,
110
+ "success_rate": 0.7,
111
+ "steps": 20,
112
+ "actions": [
113
+ "route_to_a",
114
+ "route_to_a",
115
+ "route_to_a",
116
+ "route_to_a",
117
+ "route_to_a",
118
+ "route_to_b",
119
+ "route_to_b",
120
+ "route_to_b",
121
+ "route_to_b",
122
+ "route_to_b",
123
+ "route_to_b",
124
+ "route_to_b",
125
+ "route_to_b",
126
+ "route_to_b",
127
+ "route_to_b",
128
+ "route_to_c",
129
+ "route_to_c",
130
+ "route_to_c",
131
+ "route_to_c",
132
+ "route_to_c"
133
+ ],
134
+ "rewards": [
135
+ 0.9545454545454546,
136
+ -2.0454545454545454,
137
+ 0.9545454545454546,
138
+ -2.0454545454545454,
139
+ -2.0454545454545454,
140
+ 0.7727272727272727,
141
+ 0.7727272727272727,
142
+ 0.7727272727272727,
143
+ 0.7727272727272727,
144
+ 0.7727272727272727,
145
+ 0.7727272727272727,
146
+ 0.7727272727272727,
147
+ -2.2272727272727275,
148
+ -2.2272727272727275,
149
+ -2.2272727272727275,
150
+ 0.5454545454545454,
151
+ 0.5454545454545454,
152
+ 0.5454545454545454,
153
+ 0.5454545454545454,
154
+ 0.5454545454545454
155
+ ]
156
+ },
157
+ {
158
+ "task": "hard_multi",
159
+ "seed": 2,
160
+ "policy": "heuristic",
161
+ "total_reward": -2.0,
162
+ "grader_score": 0.6165,
163
+ "success_score": 0.7,
164
+ "budget_score": 0.2,
165
+ "adaptation_score": 0.6357,
166
+ "latency_score": 0.4967,
167
+ "sla_score": 1.0,
168
+ "success_rate": 0.7,
169
+ "steps": 20,
170
+ "actions": [
171
+ "route_to_a",
172
+ "route_to_a",
173
+ "route_to_a",
174
+ "route_to_a",
175
+ "route_to_a",
176
+ "route_to_a",
177
+ "route_to_a",
178
+ "route_to_a",
179
+ "route_to_b",
180
+ "route_to_b",
181
+ "route_to_b",
182
+ "route_to_b",
183
+ "route_to_b",
184
+ "route_to_b",
185
+ "route_to_b",
186
+ "route_to_b",
187
+ "route_to_c",
188
+ "route_to_c",
189
+ "route_to_c",
190
+ "route_to_c"
191
+ ],
192
+ "rewards": [
193
+ 0.9545454545454546,
194
+ 0.9545454545454546,
195
+ 0.9545454545454546,
196
+ 0.9545454545454546,
197
+ -2.0454545454545454,
198
+ -2.0454545454545454,
199
+ 0.9545454545454546,
200
+ -2.0454545454545454,
201
+ 0.7727272727272727,
202
+ 0.7727272727272727,
203
+ 0.7727272727272727,
204
+ 0.7727272727272727,
205
+ 0.7727272727272727,
206
+ -2.2272727272727275,
207
+ -2.2272727272727275,
208
+ -2.2272727272727275,
209
+ 0.5454545454545454,
210
+ 0.5454545454545454,
211
+ 0.5454545454545454,
212
+ 0.5454545454545454
213
+ ]
214
+ },
215
+ {
216
+ "task": "hard_multi",
217
+ "seed": 3,
218
+ "policy": "heuristic",
219
+ "total_reward": -1.9895,
220
+ "grader_score": 0.6289,
221
+ "success_score": 0.7,
222
+ "budget_score": 0.2091,
223
+ "adaptation_score": 0.6833,
224
+ "latency_score": 0.5416,
225
+ "sla_score": 0.95,
226
+ "success_rate": 0.7,
227
+ "steps": 20,
228
+ "actions": [
229
+ "route_to_a",
230
+ "route_to_a",
231
+ "route_to_a",
232
+ "route_to_a",
233
+ "route_to_a",
234
+ "route_to_a",
235
+ "route_to_a",
236
+ "route_to_b",
237
+ "route_to_b",
238
+ "route_to_b",
239
+ "route_to_b",
240
+ "route_to_b",
241
+ "route_to_b",
242
+ "route_to_b",
243
+ "route_to_b",
244
+ "route_to_b",
245
+ "route_to_b",
246
+ "route_to_c",
247
+ "route_to_c",
248
+ "route_to_c"
249
+ ],
250
+ "rewards": [
251
+ 0.9545454545454546,
252
+ 0.9545454545454546,
253
+ 0.9545454545454546,
254
+ -2.0454545454545454,
255
+ 0.9545454545454546,
256
+ -2.0454545454545454,
257
+ -2.0454545454545454,
258
+ 0.7727272727272727,
259
+ 0.7727272727272727,
260
+ 0.7727272727272727,
261
+ 0.7727272727272727,
262
+ 0.7727272727272727,
263
+ 0.7727272727272727,
264
+ 0.7727272727272727,
265
+ -2.2272727272727275,
266
+ -2.262190038025986,
267
+ -2.2272727272727275,
268
+ 0.5454545454545454,
269
+ 0.5454545454545454,
270
+ 0.5454545454545454
271
+ ]
272
+ },
273
+ {
274
+ "task": "hard_multi",
275
+ "seed": 4,
276
+ "policy": "heuristic",
277
+ "total_reward": -4.0909,
278
+ "grader_score": 0.5933,
279
+ "success_score": 0.65,
280
+ "budget_score": 0.0818,
281
+ "adaptation_score": 0.6625,
282
+ "latency_score": 0.5175,
283
+ "sla_score": 1.0,
284
+ "success_rate": 0.6842,
285
+ "steps": 20,
286
+ "actions": [
287
+ "route_to_a",
288
+ "route_to_a",
289
+ "route_to_a",
290
+ "route_to_a",
291
+ "route_to_a",
292
+ "route_to_a",
293
+ "route_to_b",
294
+ "route_to_b",
295
+ "route_to_b",
296
+ "route_to_b",
297
+ "route_to_b",
298
+ "route_to_b",
299
+ "route_to_b",
300
+ "route_to_c",
301
+ "route_to_c",
302
+ "route_to_c",
303
+ "route_to_c",
304
+ "route_to_c",
305
+ "route_to_c",
306
+ "shed_load"
307
+ ],
308
+ "rewards": [
309
+ 0.9545454545454546,
310
+ -2.0454545454545454,
311
+ 0.9545454545454546,
312
+ 0.9545454545454546,
313
+ -2.0454545454545454,
314
+ -2.0454545454545454,
315
+ 0.7727272727272727,
316
+ 0.7727272727272727,
317
+ 0.7727272727272727,
318
+ -2.2272727272727275,
319
+ 0.7727272727272727,
320
+ -2.2272727272727275,
321
+ -2.2272727272727275,
322
+ 0.5454545454545454,
323
+ 0.5454545454545454,
324
+ 0.5454545454545454,
325
+ 0.5454545454545454,
326
+ 0.5454545454545454,
327
+ 0.5454545454545454,
328
+ -0.5
329
+ ]
330
+ },
331
+ {
332
+ "task": "hard_multi",
333
+ "seed": 5,
334
+ "policy": "heuristic",
335
+ "total_reward": -1.4024,
336
+ "grader_score": 0.607,
337
+ "success_score": 0.65,
338
+ "budget_score": 0.0364,
339
+ "adaptation_score": 0.8125,
340
+ "latency_score": 0.5142,
341
+ "sla_score": 0.9412,
342
+ "success_rate": 0.7647,
343
+ "steps": 20,
344
+ "actions": [
345
+ "route_to_a",
346
+ "route_to_b",
347
+ "route_to_b",
348
+ "route_to_b",
349
+ "route_to_b",
350
+ "route_to_b",
351
+ "route_to_b",
352
+ "route_to_b",
353
+ "route_to_b",
354
+ "route_to_b",
355
+ "route_to_b",
356
+ "route_to_b",
357
+ "route_to_c",
358
+ "route_to_c",
359
+ "route_to_c",
360
+ "route_to_c",
361
+ "route_to_c",
362
+ "shed_load",
363
+ "shed_load",
364
+ "shed_load"
365
+ ],
366
+ "rewards": [
367
+ -2.0454545454545454,
368
+ 0.7727272727272727,
369
+ 0.7727272727272727,
370
+ 0.7727272727272727,
371
+ 0.7727272727272727,
372
+ 0.7727272727272727,
373
+ 0.7727272727272727,
374
+ 0.7727272727272727,
375
+ -2.2272727272727275,
376
+ 0.7727272727272727,
377
+ -2.2272727272727275,
378
+ -2.311463428136077,
379
+ 0.5454545454545454,
380
+ 0.5454545454545454,
381
+ 0.5454545454545454,
382
+ 0.5454545454545454,
383
+ 0.5454545454545454,
384
+ -0.5,
385
+ -0.5,
386
+ -0.5
387
+ ]
388
+ },
389
+ {
390
+ "task": "hard_multi",
391
+ "seed": 6,
392
+ "policy": "heuristic",
393
+ "total_reward": -3.7273,
394
+ "grader_score": 0.6546,
395
+ "success_score": 0.65,
396
+ "budget_score": 0.4545,
397
+ "adaptation_score": 0.6458,
398
+ "latency_score": 0.5611,
399
+ "sla_score": 1.0,
400
+ "success_rate": 0.65,
401
+ "steps": 20,
402
+ "actions": [
403
+ "route_to_a",
404
+ "route_to_a",
405
+ "route_to_a",
406
+ "route_to_a",
407
+ "route_to_a",
408
+ "route_to_a",
409
+ "route_to_a",
410
+ "route_to_a",
411
+ "route_to_a",
412
+ "route_to_a",
413
+ "route_to_b",
414
+ "route_to_b",
415
+ "route_to_b",
416
+ "route_to_b",
417
+ "route_to_b",
418
+ "route_to_b",
419
+ "route_to_b",
420
+ "route_to_b",
421
+ "route_to_b",
422
+ "route_to_b"
423
+ ],
424
+ "rewards": [
425
+ 0.9545454545454546,
426
+ 0.9545454545454546,
427
+ -2.0454545454545454,
428
+ 0.9545454545454546,
429
+ 0.9545454545454546,
430
+ 0.9545454545454546,
431
+ 0.9545454545454546,
432
+ -2.0454545454545454,
433
+ -2.0454545454545454,
434
+ -2.0454545454545454,
435
+ 0.7727272727272727,
436
+ 0.7727272727272727,
437
+ 0.7727272727272727,
438
+ 0.7727272727272727,
439
+ 0.7727272727272727,
440
+ 0.7727272727272727,
441
+ -2.2272727272727275,
442
+ -2.2272727272727275,
443
+ 0.7727272727272727,
444
+ -2.2272727272727275
445
+ ]
446
+ },
447
+ {
448
+ "task": "hard_multi",
449
+ "seed": 7,
450
+ "policy": "heuristic",
451
+ "total_reward": 0.1818,
452
+ "grader_score": 0.6477,
453
+ "success_score": 0.7,
454
+ "budget_score": 0.0364,
455
+ "adaptation_score": 0.85,
456
+ "latency_score": 0.5613,
457
+ "sla_score": 1.0,
458
+ "success_rate": 0.7778,
459
+ "steps": 20,
460
+ "actions": [
461
+ "route_to_a",
462
+ "route_to_b",
463
+ "route_to_b",
464
+ "route_to_b",
465
+ "route_to_b",
466
+ "route_to_b",
467
+ "route_to_b",
468
+ "route_to_b",
469
+ "route_to_b",
470
+ "route_to_b",
471
+ "route_to_b",
472
+ "route_to_b",
473
+ "route_to_b",
474
+ "route_to_b",
475
+ "route_to_c",
476
+ "route_to_c",
477
+ "route_to_c",
478
+ "route_to_c",
479
+ "shed_load",
480
+ "shed_load"
481
+ ],
482
+ "rewards": [
483
+ -2.0454545454545454,
484
+ 0.7727272727272727,
485
+ 0.7727272727272727,
486
+ 0.7727272727272727,
487
+ 0.7727272727272727,
488
+ 0.7727272727272727,
489
+ 0.7727272727272727,
490
+ 0.7727272727272727,
491
+ 0.7727272727272727,
492
+ -2.2272727272727275,
493
+ -2.2272727272727275,
494
+ 0.7727272727272727,
495
+ 0.7727272727272727,
496
+ -2.2272727272727275,
497
+ 0.5454545454545454,
498
+ 0.5454545454545454,
499
+ 0.5454545454545454,
500
+ 0.5454545454545454,
501
+ -0.5,
502
+ -0.5
503
+ ]
504
+ },
505
+ {
506
+ "task": "hard_multi",
507
+ "seed": 8,
508
+ "policy": "heuristic",
509
+ "total_reward": -8.3509,
510
+ "grader_score": 0.5338,
511
+ "success_score": 0.6,
512
+ "budget_score": 0.2,
513
+ "adaptation_score": 0.5682,
514
+ "latency_score": 0.4135,
515
+ "sla_score": 0.85,
516
+ "success_rate": 0.6,
517
+ "steps": 20,
518
+ "actions": [
519
+ "route_to_a",
520
+ "route_to_a",
521
+ "route_to_a",
522
+ "route_to_a",
523
+ "route_to_a",
524
+ "route_to_a",
525
+ "route_to_a",
526
+ "route_to_a",
527
+ "route_to_b",
528
+ "route_to_b",
529
+ "route_to_b",
530
+ "route_to_b",
531
+ "route_to_b",
532
+ "route_to_b",
533
+ "route_to_b",
534
+ "route_to_b",
535
+ "route_to_c",
536
+ "route_to_c",
537
+ "route_to_c",
538
+ "route_to_c"
539
+ ],
540
+ "rewards": [
541
+ 0.9545454545454546,
542
+ -2.0454545454545454,
543
+ 0.9545454545454546,
544
+ 0.9545454545454546,
545
+ 0.9545454545454546,
546
+ -2.0454545454545454,
547
+ -2.0454545454545454,
548
+ -2.1359667972034475,
549
+ 0.7727272727272727,
550
+ -2.2272727272727275,
551
+ 0.7727272727272727,
552
+ -2.4320645744998868,
553
+ 0.7727272727272727,
554
+ 0.7727272727272727,
555
+ -2.2272727272727275,
556
+ -2.2828540896362535,
557
+ 0.5454545454545454,
558
+ 0.5454545454545454,
559
+ 0.5454545454545454,
560
+ 0.5454545454545454
561
+ ]
562
+ },
563
+ {
564
+ "task": "hard_multi",
565
+ "seed": 9,
566
+ "policy": "heuristic",
567
+ "total_reward": -1.0909,
568
+ "grader_score": 0.6315,
569
+ "success_score": 0.7,
570
+ "budget_score": 0.0818,
571
+ "adaptation_score": 0.7625,
572
+ "latency_score": 0.5336,
573
+ "sla_score": 1.0,
574
+ "success_rate": 0.7368,
575
+ "steps": 20,
576
+ "actions": [
577
+ "route_to_a",
578
+ "route_to_b",
579
+ "route_to_b",
580
+ "route_to_b",
581
+ "route_to_b",
582
+ "route_to_b",
583
+ "route_to_b",
584
+ "route_to_b",
585
+ "route_to_b",
586
+ "route_to_b",
587
+ "route_to_b",
588
+ "route_to_b",
589
+ "route_to_b",
590
+ "route_to_b",
591
+ "route_to_b",
592
+ "route_to_b",
593
+ "route_to_b",
594
+ "route_to_c",
595
+ "route_to_c",
596
+ "shed_load"
597
+ ],
598
+ "rewards": [
599
+ -2.0454545454545454,
600
+ 0.7727272727272727,
601
+ 0.7727272727272727,
602
+ 0.7727272727272727,
603
+ 0.7727272727272727,
604
+ 0.7727272727272727,
605
+ 0.7727272727272727,
606
+ 0.7727272727272727,
607
+ 0.7727272727272727,
608
+ 0.7727272727272727,
609
+ -2.2272727272727275,
610
+ 0.7727272727272727,
611
+ 0.7727272727272727,
612
+ -2.2272727272727275,
613
+ 0.7727272727272727,
614
+ -2.2272727272727275,
615
+ -2.2272727272727275,
616
+ 0.5454545454545454,
617
+ 0.5454545454545454,
618
+ -0.5
619
+ ]
620
+ },
621
+ {
622
+ "task": "hard_multi",
623
+ "seed": 0,
624
+ "policy": "llm",
625
+ "total_reward": 4.7727,
626
+ "grader_score": 0.6818,
627
+ "success_score": 0.75,
628
+ "budget_score": 0.0545,
629
+ "adaptation_score": 0.8571,
630
+ "latency_score": 0.6358,
631
+ "sla_score": 1.0,
632
+ "success_rate": 0.8824,
633
+ "steps": 20,
634
+ "actions": [
635
+ "route_to_a",
636
+ "route_to_a",
637
+ "route_to_a",
638
+ "route_to_a",
639
+ "route_to_b",
640
+ "route_to_b",
641
+ "route_to_b",
642
+ "route_to_b",
643
+ "route_to_b",
644
+ "route_to_b",
645
+ "route_to_c",
646
+ "route_to_c",
647
+ "route_to_c",
648
+ "route_to_c",
649
+ "route_to_c",
650
+ "route_to_c",
651
+ "route_to_c",
652
+ "shed_load",
653
+ "shed_load",
654
+ "shed_load"
655
+ ],
656
+ "rewards": [
657
+ 0.9545454545454546,
658
+ 0.9545454545454546,
659
+ 0.9545454545454546,
660
+ -2.0454545454545454,
661
+ 0.7727272727272727,
662
+ 0.7727272727272727,
663
+ 0.7727272727272727,
664
+ 0.7727272727272727,
665
+ 0.7727272727272727,
666
+ -2.2272727272727275,
667
+ 0.5454545454545454,
668
+ 0.5454545454545454,
669
+ 0.5454545454545454,
670
+ 0.5454545454545454,
671
+ 0.5454545454545454,
672
+ 0.5454545454545454,
673
+ 0.5454545454545454,
674
+ -0.5,
675
+ -0.5,
676
+ -0.5
677
+ ]
678
+ },
679
+ {
680
+ "task": "hard_multi",
681
+ "seed": 1,
682
+ "policy": "llm",
683
+ "total_reward": 6.1364,
684
+ "grader_score": 0.6994,
685
+ "success_score": 0.8,
686
+ "budget_score": 0.0273,
687
+ "adaptation_score": 0.8786,
688
+ "latency_score": 0.648,
689
+ "sla_score": 1.0,
690
+ "success_rate": 0.8889,
691
+ "steps": 20,
692
+ "actions": [
693
+ "route_to_a",
694
+ "route_to_a",
695
+ "route_to_b",
696
+ "route_to_b",
697
+ "route_to_b",
698
+ "route_to_b",
699
+ "route_to_b",
700
+ "route_to_b",
701
+ "route_to_b",
702
+ "route_to_b",
703
+ "route_to_b",
704
+ "route_to_b",
705
+ "route_to_b",
706
+ "route_to_c",
707
+ "route_to_c",
708
+ "route_to_c",
709
+ "route_to_c",
710
+ "route_to_c",
711
+ "shed_load",
712
+ "shed_load"
713
+ ],
714
+ "rewards": [
715
+ 0.9545454545454546,
716
+ -2.0454545454545454,
717
+ 0.7727272727272727,
718
+ 0.7727272727272727,
719
+ 0.7727272727272727,
720
+ 0.7727272727272727,
721
+ 0.7727272727272727,
722
+ 0.7727272727272727,
723
+ 0.7727272727272727,
724
+ 0.7727272727272727,
725
+ 0.7727272727272727,
726
+ 0.7727272727272727,
727
+ -2.2272727272727275,
728
+ 0.5454545454545454,
729
+ 0.5454545454545454,
730
+ 0.5454545454545454,
731
+ 0.5454545454545454,
732
+ 0.5454545454545454,
733
+ -0.5,
734
+ -0.5
735
+ ]
736
+ },
737
+ {
738
+ "task": "hard_multi",
739
+ "seed": 2,
740
+ "policy": "llm",
741
+ "total_reward": 7.9545,
742
+ "grader_score": 0.7156,
743
+ "success_score": 0.85,
744
+ "budget_score": 0.0909,
745
+ "adaptation_score": 0.8667,
746
+ "latency_score": 0.6181,
747
+ "sla_score": 1.0,
748
+ "success_rate": 0.8947,
749
+ "steps": 20,
750
+ "actions": [
751
+ "route_to_a",
752
+ "route_to_a",
753
+ "route_to_a",
754
+ "route_to_a",
755
+ "route_to_a",
756
+ "route_to_b",
757
+ "route_to_b",
758
+ "route_to_b",
759
+ "route_to_b",
760
+ "route_to_b",
761
+ "route_to_b",
762
+ "route_to_b",
763
+ "route_to_b",
764
+ "route_to_b",
765
+ "route_to_c",
766
+ "route_to_c",
767
+ "route_to_c",
768
+ "route_to_c",
769
+ "route_to_c",
770
+ "shed_load"
771
+ ],
772
+ "rewards": [
773
+ 0.9545454545454546,
774
+ 0.9545454545454546,
775
+ 0.9545454545454546,
776
+ 0.9545454545454546,
777
+ -2.0454545454545454,
778
+ 0.7727272727272727,
779
+ 0.7727272727272727,
780
+ 0.7727272727272727,
781
+ 0.7727272727272727,
782
+ 0.7727272727272727,
783
+ 0.7727272727272727,
784
+ 0.7727272727272727,
785
+ 0.7727272727272727,
786
+ -2.2272727272727275,
787
+ 0.5454545454545454,
788
+ 0.5454545454545454,
789
+ 0.5454545454545454,
790
+ 0.5454545454545454,
791
+ 0.5454545454545454,
792
+ -0.5
793
+ ]
794
+ },
795
+ {
796
+ "task": "hard_multi",
797
+ "seed": 3,
798
+ "policy": "llm",
799
+ "total_reward": 9.0455,
800
+ "grader_score": 0.7388,
801
+ "success_score": 0.9,
802
+ "budget_score": 0.0091,
803
+ "adaptation_score": 0.8944,
804
+ "latency_score": 0.6926,
805
+ "sla_score": 1.0,
806
+ "success_rate": 0.9,
807
+ "steps": 20,
808
+ "actions": [
809
+ "route_to_a",
810
+ "route_to_a",
811
+ "route_to_a",
812
+ "route_to_a",
813
+ "route_to_b",
814
+ "route_to_b",
815
+ "route_to_b",
816
+ "route_to_b",
817
+ "route_to_b",
818
+ "route_to_b",
819
+ "route_to_b",
820
+ "route_to_b",
821
+ "route_to_b",
822
+ "route_to_b",
823
+ "route_to_b",
824
+ "route_to_c",
825
+ "route_to_c",
826
+ "route_to_c",
827
+ "route_to_c",
828
+ "route_to_c"
829
+ ],
830
+ "rewards": [
831
+ 0.9545454545454546,
832
+ 0.9545454545454546,
833
+ 0.9545454545454546,
834
+ -2.0454545454545454,
835
+ 0.7727272727272727,
836
+ 0.7727272727272727,
837
+ 0.7727272727272727,
838
+ 0.7727272727272727,
839
+ 0.7727272727272727,
840
+ 0.7727272727272727,
841
+ 0.7727272727272727,
842
+ 0.7727272727272727,
843
+ 0.7727272727272727,
844
+ 0.7727272727272727,
845
+ -2.2272727272727275,
846
+ 0.5454545454545454,
847
+ 0.5454545454545454,
848
+ 0.5454545454545454,
849
+ 0.5454545454545454,
850
+ 0.5454545454545454
851
+ ]
852
+ },
853
+ {
854
+ "task": "hard_multi",
855
+ "seed": 4,
856
+ "policy": "llm",
857
+ "total_reward": -1.1364,
858
+ "grader_score": 0.624,
859
+ "success_score": 0.65,
860
+ "budget_score": 0.0727,
861
+ "adaptation_score": 0.75,
862
+ "latency_score": 0.5904,
863
+ "sla_score": 1.0,
864
+ "success_rate": 0.7647,
865
+ "steps": 20,
866
+ "actions": [
867
+ "route_to_a",
868
+ "route_to_a",
869
+ "route_to_b",
870
+ "route_to_b",
871
+ "route_to_b",
872
+ "route_to_b",
873
+ "route_to_b",
874
+ "route_to_b",
875
+ "route_to_b",
876
+ "route_to_b",
877
+ "route_to_b",
878
+ "route_to_b",
879
+ "route_to_c",
880
+ "route_to_c",
881
+ "route_to_c",
882
+ "route_to_c",
883
+ "route_to_c",
884
+ "shed_load",
885
+ "shed_load",
886
+ "shed_load"
887
+ ],
888
+ "rewards": [
889
+ 0.9545454545454546,
890
+ -2.0454545454545454,
891
+ 0.7727272727272727,
892
+ 0.7727272727272727,
893
+ -2.2272727272727275,
894
+ 0.7727272727272727,
895
+ 0.7727272727272727,
896
+ 0.7727272727272727,
897
+ 0.7727272727272727,
898
+ -2.2272727272727275,
899
+ 0.7727272727272727,
900
+ -2.2272727272727275,
901
+ 0.5454545454545454,
902
+ 0.5454545454545454,
903
+ 0.5454545454545454,
904
+ 0.5454545454545454,
905
+ 0.5454545454545454,
906
+ -0.5,
907
+ -0.5,
908
+ -0.5
909
+ ]
910
+ },
911
+ {
912
+ "task": "hard_multi",
913
+ "seed": 5,
914
+ "policy": "llm",
915
+ "total_reward": 1.9091,
916
+ "grader_score": 0.6665,
917
+ "success_score": 0.65,
918
+ "budget_score": 0.0818,
919
+ "adaptation_score": 0.9375,
920
+ "latency_score": 0.6085,
921
+ "sla_score": 1.0,
922
+ "success_rate": 0.8667,
923
+ "steps": 20,
924
+ "actions": [
925
+ "route_to_a",
926
+ "route_to_b",
927
+ "route_to_b",
928
+ "route_to_b",
929
+ "route_to_b",
930
+ "route_to_b",
931
+ "route_to_b",
932
+ "route_to_b",
933
+ "route_to_b",
934
+ "route_to_c",
935
+ "route_to_c",
936
+ "route_to_c",
937
+ "route_to_c",
938
+ "route_to_c",
939
+ "route_to_c",
940
+ "shed_load",
941
+ "shed_load",
942
+ "shed_load",
943
+ "shed_load",
944
+ "shed_load"
945
+ ],
946
+ "rewards": [
947
+ -2.0454545454545454,
948
+ 0.7727272727272727,
949
+ 0.7727272727272727,
950
+ 0.7727272727272727,
951
+ 0.7727272727272727,
952
+ 0.7727272727272727,
953
+ 0.7727272727272727,
954
+ 0.7727272727272727,
955
+ -2.2272727272727275,
956
+ 0.5454545454545454,
957
+ 0.5454545454545454,
958
+ 0.5454545454545454,
959
+ 0.5454545454545454,
960
+ 0.5454545454545454,
961
+ 0.5454545454545454,
962
+ -0.5,
963
+ -0.5,
964
+ -0.5,
965
+ -0.5,
966
+ -0.5
967
+ ]
968
+ },
969
+ {
970
+ "task": "hard_multi",
971
+ "seed": 6,
972
+ "policy": "llm",
973
+ "total_reward": 9.3182,
974
+ "grader_score": 0.7535,
975
+ "success_score": 0.9,
976
+ "budget_score": 0.0636,
977
+ "adaptation_score": 0.9444,
978
+ "latency_score": 0.6755,
979
+ "sla_score": 1.0,
980
+ "success_rate": 0.9,
981
+ "steps": 20,
982
+ "actions": [
983
+ "route_to_a",
984
+ "route_to_a",
985
+ "route_to_a",
986
+ "route_to_b",
987
+ "route_to_b",
988
+ "route_to_b",
989
+ "route_to_b",
990
+ "route_to_b",
991
+ "route_to_b",
992
+ "route_to_b",
993
+ "route_to_b",
994
+ "route_to_b",
995
+ "route_to_b",
996
+ "route_to_b",
997
+ "route_to_b",
998
+ "route_to_b",
999
+ "route_to_b",
1000
+ "route_to_c",
1001
+ "route_to_c",
1002
+ "route_to_c"
1003
+ ],
1004
+ "rewards": [
1005
+ 0.9545454545454546,
1006
+ 0.9545454545454546,
1007
+ -2.0454545454545454,
1008
+ 0.7727272727272727,
1009
+ 0.7727272727272727,
1010
+ 0.7727272727272727,
1011
+ 0.7727272727272727,
1012
+ 0.7727272727272727,
1013
+ 0.7727272727272727,
1014
+ 0.7727272727272727,
1015
+ 0.7727272727272727,
1016
+ 0.7727272727272727,
1017
+ 0.7727272727272727,
1018
+ 0.7727272727272727,
1019
+ 0.7727272727272727,
1020
+ 0.7727272727272727,
1021
+ -2.2272727272727275,
1022
+ 0.5454545454545454,
1023
+ 0.5454545454545454,
1024
+ 0.5454545454545454
1025
+ ]
1026
+ },
1027
+ {
1028
+ "task": "hard_multi",
1029
+ "seed": 7,
1030
+ "policy": "llm",
1031
+ "total_reward": 3.1818,
1032
+ "grader_score": 0.673,
1033
+ "success_score": 0.7,
1034
+ "budget_score": 0.0364,
1035
+ "adaptation_score": 0.9375,
1036
+ "latency_score": 0.6004,
1037
+ "sla_score": 1.0,
1038
+ "success_rate": 0.875,
1039
+ "steps": 20,
1040
+ "actions": [
1041
+ "route_to_a",
1042
+ "route_to_b",
1043
+ "route_to_b",
1044
+ "route_to_b",
1045
+ "route_to_b",
1046
+ "route_to_b",
1047
+ "route_to_b",
1048
+ "route_to_b",
1049
+ "route_to_b",
1050
+ "route_to_b",
1051
+ "route_to_c",
1052
+ "route_to_c",
1053
+ "route_to_c",
1054
+ "route_to_c",
1055
+ "route_to_c",
1056
+ "route_to_c",
1057
+ "shed_load",
1058
+ "shed_load",
1059
+ "shed_load",
1060
+ "shed_load"
1061
+ ],
1062
+ "rewards": [
1063
+ -2.0454545454545454,
1064
+ 0.7727272727272727,
1065
+ 0.7727272727272727,
1066
+ 0.7727272727272727,
1067
+ 0.7727272727272727,
1068
+ 0.7727272727272727,
1069
+ 0.7727272727272727,
1070
+ 0.7727272727272727,
1071
+ 0.7727272727272727,
1072
+ -2.2272727272727275,
1073
+ 0.5454545454545454,
1074
+ 0.5454545454545454,
1075
+ 0.5454545454545454,
1076
+ 0.5454545454545454,
1077
+ 0.5454545454545454,
1078
+ 0.5454545454545454,
1079
+ -0.5,
1080
+ -0.5,
1081
+ -0.5,
1082
+ -0.5
1083
+ ]
1084
+ },
1085
+ {
1086
+ "task": "hard_multi",
1087
+ "seed": 8,
1088
+ "policy": "llm",
1089
+ "total_reward": 3.3636,
1090
+ "grader_score": 0.6573,
1091
+ "success_score": 0.7,
1092
+ "budget_score": 0.0727,
1093
+ "adaptation_score": 0.8661,
1094
+ "latency_score": 0.5661,
1095
+ "sla_score": 1.0,
1096
+ "success_rate": 0.875,
1097
+ "steps": 20,
1098
+ "actions": [
1099
+ "route_to_a",
1100
+ "route_to_a",
1101
+ "route_to_b",
1102
+ "route_to_b",
1103
+ "route_to_b",
1104
+ "route_to_b",
1105
+ "route_to_b",
1106
+ "route_to_b",
1107
+ "route_to_b",
1108
+ "route_to_b",
1109
+ "route_to_c",
1110
+ "route_to_c",
1111
+ "route_to_c",
1112
+ "route_to_c",
1113
+ "route_to_c",
1114
+ "route_to_c",
1115
+ "shed_load",
1116
+ "shed_load",
1117
+ "shed_load",
1118
+ "shed_load"
1119
+ ],
1120
+ "rewards": [
1121
+ 0.9545454545454546,
1122
+ -2.0454545454545454,
1123
+ 0.7727272727272727,
1124
+ 0.7727272727272727,
1125
+ 0.7727272727272727,
1126
+ 0.7727272727272727,
1127
+ 0.7727272727272727,
1128
+ 0.7727272727272727,
1129
+ 0.7727272727272727,
1130
+ -2.2272727272727275,
1131
+ 0.5454545454545454,
1132
+ 0.5454545454545454,
1133
+ 0.5454545454545454,
1134
+ 0.5454545454545454,
1135
+ 0.5454545454545454,
1136
+ 0.5454545454545454,
1137
+ -0.5,
1138
+ -0.5,
1139
+ -0.5,
1140
+ -0.5
1141
+ ]
1142
+ },
1143
+ {
1144
+ "task": "hard_multi",
1145
+ "seed": 9,
1146
+ "policy": "llm",
1147
+ "total_reward": 3.4091,
1148
+ "grader_score": 0.6783,
1149
+ "success_score": 0.7,
1150
+ "budget_score": 0.0818,
1151
+ "adaptation_score": 0.95,
1152
+ "latency_score": 0.5803,
1153
+ "sla_score": 1.0,
1154
+ "success_rate": 0.875,
1155
+ "steps": 20,
1156
+ "actions": [
1157
+ "route_to_a",
1158
+ "route_to_b",
1159
+ "route_to_b",
1160
+ "route_to_b",
1161
+ "route_to_b",
1162
+ "route_to_b",
1163
+ "route_to_b",
1164
+ "route_to_b",
1165
+ "route_to_b",
1166
+ "route_to_b",
1167
+ "route_to_b",
1168
+ "route_to_c",
1169
+ "route_to_c",
1170
+ "route_to_c",
1171
+ "route_to_c",
1172
+ "route_to_c",
1173
+ "shed_load",
1174
+ "shed_load",
1175
+ "shed_load",
1176
+ "shed_load"
1177
+ ],
1178
+ "rewards": [
1179
+ -2.0454545454545454,
1180
+ 0.7727272727272727,
1181
+ 0.7727272727272727,
1182
+ 0.7727272727272727,
1183
+ 0.7727272727272727,
1184
+ 0.7727272727272727,
1185
+ 0.7727272727272727,
1186
+ 0.7727272727272727,
1187
+ 0.7727272727272727,
1188
+ 0.7727272727272727,
1189
+ -2.2272727272727275,
1190
+ 0.5454545454545454,
1191
+ 0.5454545454545454,
1192
+ 0.5454545454545454,
1193
+ 0.5454545454545454,
1194
+ 0.5454545454545454,
1195
+ -0.5,
1196
+ -0.5,
1197
+ -0.5,
1198
+ -0.5
1199
+ ]
1200
+ }
1201
+ ]
1202
+ }
eval/outputs/prompt_audit/budget_guard_dev10/eval_summary_20260425_164343.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Budget Router Evaluation — 20260425_164343
2
+
3
+ | Task | HEURISTIC Grader | LLM Grader | Notes |
4
+ |---|---|---|---|
5
+ | Hard_Multi | 0.6078 (n=10) | 0.6888 (n=10) | LLM +8.1 points vs heuristic |
eval/outputs/prompt_audit/budget_guard_heldout5/eval_results_20260425_163956.json ADDED
@@ -0,0 +1,617 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "timestamp": "20260425_163956",
4
+ "policies": [
5
+ "heuristic",
6
+ "llm"
7
+ ],
8
+ "tasks": [
9
+ "hard_multi"
10
+ ],
11
+ "seeds": [
12
+ 100,
13
+ 101,
14
+ 102,
15
+ 103,
16
+ 104
17
+ ]
18
+ },
19
+ "summary": {
20
+ "hard_multi|heuristic": {
21
+ "grader_mean": 0.6175,
22
+ "reward_mean": -2.1399,
23
+ "success_rate": 0.7108,
24
+ "adaptation": 0.7001,
25
+ "n": 5
26
+ },
27
+ "hard_multi|llm": {
28
+ "grader_mean": 0.6577,
29
+ "reward_mean": 2.3818,
30
+ "success_rate": 0.8196,
31
+ "adaptation": 0.8216,
32
+ "n": 5
33
+ }
34
+ },
35
+ "episodes": [
36
+ {
37
+ "task": "hard_multi",
38
+ "seed": 100,
39
+ "policy": "heuristic",
40
+ "total_reward": -7.0629,
41
+ "grader_score": 0.5459,
42
+ "success_score": 0.6,
43
+ "budget_score": 0.0909,
44
+ "adaptation_score": 0.6111,
45
+ "latency_score": 0.4399,
46
+ "sla_score": 0.9474,
47
+ "success_rate": 0.6316,
48
+ "steps": 20,
49
+ "actions": [
50
+ "route_to_a",
51
+ "route_to_a",
52
+ "route_to_a",
53
+ "route_to_a",
54
+ "route_to_a",
55
+ "route_to_b",
56
+ "route_to_b",
57
+ "route_to_b",
58
+ "route_to_b",
59
+ "route_to_b",
60
+ "route_to_b",
61
+ "route_to_b",
62
+ "route_to_b",
63
+ "route_to_b",
64
+ "route_to_c",
65
+ "route_to_c",
66
+ "route_to_c",
67
+ "route_to_c",
68
+ "route_to_c",
69
+ "shed_load"
70
+ ],
71
+ "rewards": [
72
+ 0.9545454545454546,
73
+ -2.0454545454545454,
74
+ 0.9545454545454546,
75
+ -2.0454545454545454,
76
+ -2.0454545454545454,
77
+ 0.7727272727272727,
78
+ -2.2272727272727275,
79
+ 0.7727272727272727,
80
+ 0.7727272727272727,
81
+ 0.7727272727272727,
82
+ -2.2272727272727275,
83
+ 0.7727272727272727,
84
+ -2.2272727272727275,
85
+ -2.2447259319640143,
86
+ 0.5454545454545454,
87
+ 0.5454545454545454,
88
+ 0.5454545454545454,
89
+ 0.5454545454545454,
90
+ 0.5454545454545454,
91
+ -0.5
92
+ ]
93
+ },
94
+ {
95
+ "task": "hard_multi",
96
+ "seed": 101,
97
+ "policy": "heuristic",
98
+ "total_reward": 3.4091,
99
+ "grader_score": 0.6753,
100
+ "success_score": 0.8,
101
+ "budget_score": 0.0818,
102
+ "adaptation_score": 0.7857,
103
+ "latency_score": 0.5795,
104
+ "sla_score": 1.0,
105
+ "success_rate": 0.8,
106
+ "steps": 20,
107
+ "actions": [
108
+ "route_to_a",
109
+ "route_to_b",
110
+ "route_to_b",
111
+ "route_to_b",
112
+ "route_to_b",
113
+ "route_to_b",
114
+ "route_to_b",
115
+ "route_to_b",
116
+ "route_to_b",
117
+ "route_to_b",
118
+ "route_to_b",
119
+ "route_to_b",
120
+ "route_to_b",
121
+ "route_to_b",
122
+ "route_to_b",
123
+ "route_to_b",
124
+ "route_to_b",
125
+ "route_to_b",
126
+ "route_to_b",
127
+ "route_to_c"
128
+ ],
129
+ "rewards": [
130
+ -2.0454545454545454,
131
+ 0.7727272727272727,
132
+ 0.7727272727272727,
133
+ 0.7727272727272727,
134
+ 0.7727272727272727,
135
+ 0.7727272727272727,
136
+ 0.7727272727272727,
137
+ 0.7727272727272727,
138
+ 0.7727272727272727,
139
+ 0.7727272727272727,
140
+ 0.7727272727272727,
141
+ 0.7727272727272727,
142
+ 0.7727272727272727,
143
+ 0.7727272727272727,
144
+ -2.2272727272727275,
145
+ 0.7727272727272727,
146
+ -2.2272727272727275,
147
+ 0.7727272727272727,
148
+ -2.2272727272727275,
149
+ 0.5454545454545454
150
+ ]
151
+ },
152
+ {
153
+ "task": "hard_multi",
154
+ "seed": 102,
155
+ "policy": "heuristic",
156
+ "total_reward": -2.5909,
157
+ "grader_score": 0.6228,
158
+ "success_score": 0.7,
159
+ "budget_score": 0.0818,
160
+ "adaptation_score": 0.6932,
161
+ "latency_score": 0.5593,
162
+ "sla_score": 1.0,
163
+ "success_rate": 0.7,
164
+ "steps": 20,
165
+ "actions": [
166
+ "route_to_a",
167
+ "route_to_a",
168
+ "route_to_a",
169
+ "route_to_a",
170
+ "route_to_a",
171
+ "route_to_a",
172
+ "route_to_b",
173
+ "route_to_b",
174
+ "route_to_b",
175
+ "route_to_b",
176
+ "route_to_b",
177
+ "route_to_b",
178
+ "route_to_b",
179
+ "route_to_b",
180
+ "route_to_b",
181
+ "route_to_c",
182
+ "route_to_c",
183
+ "route_to_c",
184
+ "route_to_c",
185
+ "route_to_c"
186
+ ],
187
+ "rewards": [
188
+ 0.9545454545454546,
189
+ 0.9545454545454546,
190
+ -2.0454545454545454,
191
+ -2.0454545454545454,
192
+ 0.9545454545454546,
193
+ -2.0454545454545454,
194
+ 0.7727272727272727,
195
+ 0.7727272727272727,
196
+ 0.7727272727272727,
197
+ 0.7727272727272727,
198
+ 0.7727272727272727,
199
+ -2.2272727272727275,
200
+ 0.7727272727272727,
201
+ -2.2272727272727275,
202
+ -2.2272727272727275,
203
+ 0.5454545454545454,
204
+ 0.5454545454545454,
205
+ 0.5454545454545454,
206
+ 0.5454545454545454,
207
+ 0.5454545454545454
208
+ ]
209
+ },
210
+ {
211
+ "task": "hard_multi",
212
+ "seed": 103,
213
+ "policy": "heuristic",
214
+ "total_reward": -2.8182,
215
+ "grader_score": 0.6003,
216
+ "success_score": 0.65,
217
+ "budget_score": 0.0364,
218
+ "adaptation_score": 0.75,
219
+ "latency_score": 0.4991,
220
+ "sla_score": 1.0,
221
+ "success_rate": 0.7222,
222
+ "steps": 20,
223
+ "actions": [
224
+ "route_to_a",
225
+ "route_to_b",
226
+ "route_to_b",
227
+ "route_to_b",
228
+ "route_to_b",
229
+ "route_to_b",
230
+ "route_to_b",
231
+ "route_to_b",
232
+ "route_to_b",
233
+ "route_to_b",
234
+ "route_to_b",
235
+ "route_to_b",
236
+ "route_to_b",
237
+ "route_to_b",
238
+ "route_to_c",
239
+ "route_to_c",
240
+ "route_to_c",
241
+ "route_to_c",
242
+ "shed_load",
243
+ "shed_load"
244
+ ],
245
+ "rewards": [
246
+ -2.0454545454545454,
247
+ 0.7727272727272727,
248
+ 0.7727272727272727,
249
+ 0.7727272727272727,
250
+ 0.7727272727272727,
251
+ -2.2272727272727275,
252
+ 0.7727272727272727,
253
+ 0.7727272727272727,
254
+ 0.7727272727272727,
255
+ 0.7727272727272727,
256
+ -2.2272727272727275,
257
+ -2.2272727272727275,
258
+ 0.7727272727272727,
259
+ -2.2272727272727275,
260
+ 0.5454545454545454,
261
+ 0.5454545454545454,
262
+ 0.5454545454545454,
263
+ 0.5454545454545454,
264
+ -0.5,
265
+ -0.5
266
+ ]
267
+ },
268
+ {
269
+ "task": "hard_multi",
270
+ "seed": 104,
271
+ "policy": "heuristic",
272
+ "total_reward": -1.6364,
273
+ "grader_score": 0.6432,
274
+ "success_score": 0.7,
275
+ "budget_score": 0.2727,
276
+ "adaptation_score": 0.6607,
277
+ "latency_score": 0.5509,
278
+ "sla_score": 1.0,
279
+ "success_rate": 0.7,
280
+ "steps": 20,
281
+ "actions": [
282
+ "route_to_a",
283
+ "route_to_a",
284
+ "route_to_a",
285
+ "route_to_a",
286
+ "route_to_a",
287
+ "route_to_b",
288
+ "route_to_b",
289
+ "route_to_b",
290
+ "route_to_b",
291
+ "route_to_b",
292
+ "route_to_b",
293
+ "route_to_b",
294
+ "route_to_b",
295
+ "route_to_b",
296
+ "route_to_b",
297
+ "route_to_b",
298
+ "route_to_b",
299
+ "route_to_b",
300
+ "route_to_b",
301
+ "route_to_b"
302
+ ],
303
+ "rewards": [
304
+ 0.9545454545454546,
305
+ -2.0454545454545454,
306
+ 0.9545454545454546,
307
+ -2.0454545454545454,
308
+ -2.0454545454545454,
309
+ 0.7727272727272727,
310
+ 0.7727272727272727,
311
+ 0.7727272727272727,
312
+ 0.7727272727272727,
313
+ 0.7727272727272727,
314
+ 0.7727272727272727,
315
+ 0.7727272727272727,
316
+ 0.7727272727272727,
317
+ -2.2272727272727275,
318
+ 0.7727272727272727,
319
+ 0.7727272727272727,
320
+ 0.7727272727272727,
321
+ -2.2272727272727275,
322
+ 0.7727272727272727,
323
+ -2.2272727272727275
324
+ ]
325
+ },
326
+ {
327
+ "task": "hard_multi",
328
+ "seed": 100,
329
+ "policy": "llm",
330
+ "total_reward": -5.6364,
331
+ "grader_score": 0.5687,
332
+ "success_score": 0.6,
333
+ "budget_score": 0.0727,
334
+ "adaptation_score": 0.6458,
335
+ "latency_score": 0.493,
336
+ "sla_score": 1.0,
337
+ "success_rate": 0.6667,
338
+ "steps": 20,
339
+ "actions": [
340
+ "route_to_a",
341
+ "route_to_a",
342
+ "route_to_b",
343
+ "route_to_b",
344
+ "route_to_b",
345
+ "route_to_b",
346
+ "route_to_b",
347
+ "route_to_b",
348
+ "route_to_b",
349
+ "route_to_b",
350
+ "route_to_b",
351
+ "route_to_b",
352
+ "route_to_b",
353
+ "route_to_b",
354
+ "route_to_c",
355
+ "route_to_c",
356
+ "route_to_c",
357
+ "route_to_c",
358
+ "shed_load",
359
+ "shed_load"
360
+ ],
361
+ "rewards": [
362
+ 0.9545454545454546,
363
+ -2.0454545454545454,
364
+ 0.7727272727272727,
365
+ -2.2272727272727275,
366
+ 0.7727272727272727,
367
+ 0.7727272727272727,
368
+ -2.2272727272727275,
369
+ 0.7727272727272727,
370
+ 0.7727272727272727,
371
+ 0.7727272727272727,
372
+ -2.2272727272727275,
373
+ 0.7727272727272727,
374
+ -2.2272727272727275,
375
+ -2.2272727272727275,
376
+ 0.5454545454545454,
377
+ 0.5454545454545454,
378
+ 0.5454545454545454,
379
+ 0.5454545454545454,
380
+ -0.5,
381
+ -0.5
382
+ ]
383
+ },
384
+ {
385
+ "task": "hard_multi",
386
+ "seed": 101,
387
+ "policy": "llm",
388
+ "total_reward": 6.4091,
389
+ "grader_score": 0.7038,
390
+ "success_score": 0.8,
391
+ "budget_score": 0.0818,
392
+ "adaptation_score": 0.9,
393
+ "latency_score": 0.6076,
394
+ "sla_score": 1.0,
395
+ "success_rate": 0.8889,
396
+ "steps": 20,
397
+ "actions": [
398
+ "route_to_a",
399
+ "route_to_b",
400
+ "route_to_b",
401
+ "route_to_b",
402
+ "route_to_b",
403
+ "route_to_b",
404
+ "route_to_b",
405
+ "route_to_b",
406
+ "route_to_b",
407
+ "route_to_b",
408
+ "route_to_b",
409
+ "route_to_b",
410
+ "route_to_b",
411
+ "route_to_b",
412
+ "route_to_b",
413
+ "route_to_c",
414
+ "route_to_c",
415
+ "route_to_c",
416
+ "shed_load",
417
+ "shed_load"
418
+ ],
419
+ "rewards": [
420
+ -2.0454545454545454,
421
+ 0.7727272727272727,
422
+ 0.7727272727272727,
423
+ 0.7727272727272727,
424
+ 0.7727272727272727,
425
+ 0.7727272727272727,
426
+ 0.7727272727272727,
427
+ 0.7727272727272727,
428
+ 0.7727272727272727,
429
+ 0.7727272727272727,
430
+ 0.7727272727272727,
431
+ 0.7727272727272727,
432
+ 0.7727272727272727,
433
+ 0.7727272727272727,
434
+ -2.2272727272727275,
435
+ 0.5454545454545454,
436
+ 0.5454545454545454,
437
+ 0.5454545454545454,
438
+ -0.5,
439
+ -0.5
440
+ ]
441
+ },
442
+ {
443
+ "task": "hard_multi",
444
+ "seed": 102,
445
+ "policy": "llm",
446
+ "total_reward": 6.0909,
447
+ "grader_score": 0.707,
448
+ "success_score": 0.8,
449
+ "budget_score": 0.0182,
450
+ "adaptation_score": 0.9091,
451
+ "latency_score": 0.6621,
452
+ "sla_score": 1.0,
453
+ "success_rate": 0.8889,
454
+ "steps": 20,
455
+ "actions": [
456
+ "route_to_a",
457
+ "route_to_a",
458
+ "route_to_a",
459
+ "route_to_b",
460
+ "route_to_b",
461
+ "route_to_b",
462
+ "route_to_b",
463
+ "route_to_b",
464
+ "route_to_b",
465
+ "route_to_b",
466
+ "route_to_b",
467
+ "route_to_b",
468
+ "route_to_c",
469
+ "route_to_c",
470
+ "route_to_c",
471
+ "route_to_c",
472
+ "route_to_c",
473
+ "route_to_c",
474
+ "shed_load",
475
+ "shed_load"
476
+ ],
477
+ "rewards": [
478
+ 0.9545454545454546,
479
+ 0.9545454545454546,
480
+ -2.0454545454545454,
481
+ 0.7727272727272727,
482
+ 0.7727272727272727,
483
+ 0.7727272727272727,
484
+ 0.7727272727272727,
485
+ 0.7727272727272727,
486
+ 0.7727272727272727,
487
+ 0.7727272727272727,
488
+ 0.7727272727272727,
489
+ -2.2272727272727275,
490
+ 0.5454545454545454,
491
+ 0.5454545454545454,
492
+ 0.5454545454545454,
493
+ 0.5454545454545454,
494
+ 0.5454545454545454,
495
+ 0.5454545454545454,
496
+ -0.5,
497
+ -0.5
498
+ ]
499
+ },
500
+ {
501
+ "task": "hard_multi",
502
+ "seed": 103,
503
+ "policy": "llm",
504
+ "total_reward": -1.3182,
505
+ "grader_score": 0.6135,
506
+ "success_score": 0.65,
507
+ "budget_score": 0.0364,
508
+ "adaptation_score": 0.7946,
509
+ "latency_score": 0.5208,
510
+ "sla_score": 1.0,
511
+ "success_rate": 0.7647,
512
+ "steps": 20,
513
+ "actions": [
514
+ "route_to_a",
515
+ "route_to_b",
516
+ "route_to_b",
517
+ "route_to_b",
518
+ "route_to_b",
519
+ "route_to_b",
520
+ "route_to_b",
521
+ "route_to_b",
522
+ "route_to_b",
523
+ "route_to_b",
524
+ "route_to_b",
525
+ "route_to_b",
526
+ "route_to_c",
527
+ "route_to_c",
528
+ "route_to_c",
529
+ "route_to_c",
530
+ "route_to_c",
531
+ "shed_load",
532
+ "shed_load",
533
+ "shed_load"
534
+ ],
535
+ "rewards": [
536
+ -2.0454545454545454,
537
+ 0.7727272727272727,
538
+ 0.7727272727272727,
539
+ 0.7727272727272727,
540
+ 0.7727272727272727,
541
+ -2.2272727272727275,
542
+ 0.7727272727272727,
543
+ 0.7727272727272727,
544
+ 0.7727272727272727,
545
+ 0.7727272727272727,
546
+ -2.2272727272727275,
547
+ -2.2272727272727275,
548
+ 0.5454545454545454,
549
+ 0.5454545454545454,
550
+ 0.5454545454545454,
551
+ 0.5454545454545454,
552
+ 0.5454545454545454,
553
+ -0.5,
554
+ -0.5,
555
+ -0.5
556
+ ]
557
+ },
558
+ {
559
+ "task": "hard_multi",
560
+ "seed": 104,
561
+ "policy": "llm",
562
+ "total_reward": 6.3636,
563
+ "grader_score": 0.6953,
564
+ "success_score": 0.8,
565
+ "budget_score": 0.0727,
566
+ "adaptation_score": 0.8583,
567
+ "latency_score": 0.6138,
568
+ "sla_score": 1.0,
569
+ "success_rate": 0.8889,
570
+ "steps": 20,
571
+ "actions": [
572
+ "route_to_a",
573
+ "route_to_a",
574
+ "route_to_b",
575
+ "route_to_b",
576
+ "route_to_b",
577
+ "route_to_b",
578
+ "route_to_b",
579
+ "route_to_b",
580
+ "route_to_b",
581
+ "route_to_b",
582
+ "route_to_b",
583
+ "route_to_b",
584
+ "route_to_b",
585
+ "route_to_b",
586
+ "route_to_c",
587
+ "route_to_c",
588
+ "route_to_c",
589
+ "route_to_c",
590
+ "shed_load",
591
+ "shed_load"
592
+ ],
593
+ "rewards": [
594
+ 0.9545454545454546,
595
+ -2.0454545454545454,
596
+ 0.7727272727272727,
597
+ 0.7727272727272727,
598
+ 0.7727272727272727,
599
+ 0.7727272727272727,
600
+ 0.7727272727272727,
601
+ 0.7727272727272727,
602
+ 0.7727272727272727,
603
+ 0.7727272727272727,
604
+ 0.7727272727272727,
605
+ 0.7727272727272727,
606
+ 0.7727272727272727,
607
+ -2.2272727272727275,
608
+ 0.5454545454545454,
609
+ 0.5454545454545454,
610
+ 0.5454545454545454,
611
+ 0.5454545454545454,
612
+ -0.5,
613
+ -0.5
614
+ ]
615
+ }
616
+ ]
617
+ }
eval/outputs/prompt_audit/budget_guard_heldout5/eval_summary_20260425_163956.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Budget Router Evaluation — 20260425_163956
2
+
3
+ | Task | HEURISTIC Grader | LLM Grader | Notes |
4
+ |---|---|---|---|
5
+ | Hard_Multi | 0.6175 (n=5) | 0.6577 (n=5) | LLM +4.0 points vs heuristic |
eval/outputs/trace_compare/eval_seed101/eval_results_20260425_192545.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "timestamp": "20260425_192545",
4
+ "policies": [
5
+ "heuristic",
6
+ "llm"
7
+ ],
8
+ "tasks": [
9
+ "hard_multi"
10
+ ],
11
+ "seeds": [
12
+ 101
13
+ ]
14
+ },
15
+ "summary": {
16
+ "hard_multi|heuristic": {
17
+ "grader_mean": 0.6753,
18
+ "reward_mean": 3.4091,
19
+ "success_rate": 0.8,
20
+ "adaptation": 0.7857,
21
+ "n": 1
22
+ },
23
+ "hard_multi|llm": {
24
+ "grader_mean": 0.7038,
25
+ "reward_mean": 6.4091,
26
+ "success_rate": 0.8889,
27
+ "adaptation": 0.9,
28
+ "n": 1
29
+ }
30
+ },
31
+ "episodes": [
32
+ {
33
+ "task": "hard_multi",
34
+ "seed": 101,
35
+ "policy": "heuristic",
36
+ "total_reward": 3.4091,
37
+ "grader_score": 0.6753,
38
+ "success_score": 0.8,
39
+ "budget_score": 0.0818,
40
+ "adaptation_score": 0.7857,
41
+ "latency_score": 0.5795,
42
+ "sla_score": 1.0,
43
+ "success_rate": 0.8,
44
+ "steps": 20,
45
+ "actions": [
46
+ "route_to_a",
47
+ "route_to_b",
48
+ "route_to_b",
49
+ "route_to_b",
50
+ "route_to_b",
51
+ "route_to_b",
52
+ "route_to_b",
53
+ "route_to_b",
54
+ "route_to_b",
55
+ "route_to_b",
56
+ "route_to_b",
57
+ "route_to_b",
58
+ "route_to_b",
59
+ "route_to_b",
60
+ "route_to_b",
61
+ "route_to_b",
62
+ "route_to_b",
63
+ "route_to_b",
64
+ "route_to_b",
65
+ "route_to_c"
66
+ ],
67
+ "rewards": [
68
+ -2.0454545454545454,
69
+ 0.7727272727272727,
70
+ 0.7727272727272727,
71
+ 0.7727272727272727,
72
+ 0.7727272727272727,
73
+ 0.7727272727272727,
74
+ 0.7727272727272727,
75
+ 0.7727272727272727,
76
+ 0.7727272727272727,
77
+ 0.7727272727272727,
78
+ 0.7727272727272727,
79
+ 0.7727272727272727,
80
+ 0.7727272727272727,
81
+ 0.7727272727272727,
82
+ -2.2272727272727275,
83
+ 0.7727272727272727,
84
+ -2.2272727272727275,
85
+ 0.7727272727272727,
86
+ -2.2272727272727275,
87
+ 0.5454545454545454
88
+ ]
89
+ },
90
+ {
91
+ "task": "hard_multi",
92
+ "seed": 101,
93
+ "policy": "llm",
94
+ "total_reward": 6.4091,
95
+ "grader_score": 0.7038,
96
+ "success_score": 0.8,
97
+ "budget_score": 0.0818,
98
+ "adaptation_score": 0.9,
99
+ "latency_score": 0.6076,
100
+ "sla_score": 1.0,
101
+ "success_rate": 0.8889,
102
+ "steps": 20,
103
+ "actions": [
104
+ "route_to_a",
105
+ "route_to_b",
106
+ "route_to_b",
107
+ "route_to_b",
108
+ "route_to_b",
109
+ "route_to_b",
110
+ "route_to_b",
111
+ "route_to_b",
112
+ "route_to_b",
113
+ "route_to_b",
114
+ "route_to_b",
115
+ "route_to_b",
116
+ "route_to_b",
117
+ "route_to_b",
118
+ "route_to_b",
119
+ "route_to_c",
120
+ "route_to_c",
121
+ "route_to_c",
122
+ "shed_load",
123
+ "shed_load"
124
+ ],
125
+ "rewards": [
126
+ -2.0454545454545454,
127
+ 0.7727272727272727,
128
+ 0.7727272727272727,
129
+ 0.7727272727272727,
130
+ 0.7727272727272727,
131
+ 0.7727272727272727,
132
+ 0.7727272727272727,
133
+ 0.7727272727272727,
134
+ 0.7727272727272727,
135
+ 0.7727272727272727,
136
+ 0.7727272727272727,
137
+ 0.7727272727272727,
138
+ 0.7727272727272727,
139
+ 0.7727272727272727,
140
+ -2.2272727272727275,
141
+ 0.5454545454545454,
142
+ 0.5454545454545454,
143
+ 0.5454545454545454,
144
+ -0.5,
145
+ -0.5
146
+ ]
147
+ }
148
+ ]
149
+ }
eval/outputs/trace_compare/eval_seed101/eval_results_20260425_192656.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "timestamp": "20260425_192656",
4
+ "policies": [
5
+ "heuristic",
6
+ "llm"
7
+ ],
8
+ "tasks": [
9
+ "hard_multi"
10
+ ],
11
+ "seeds": [
12
+ 102
13
+ ]
14
+ },
15
+ "summary": {
16
+ "hard_multi|heuristic": {
17
+ "grader_mean": 0.6228,
18
+ "reward_mean": -2.5909,
19
+ "success_rate": 0.7,
20
+ "adaptation": 0.6932,
21
+ "n": 1
22
+ },
23
+ "hard_multi|llm": {
24
+ "grader_mean": 0.707,
25
+ "reward_mean": 6.0909,
26
+ "success_rate": 0.8889,
27
+ "adaptation": 0.9091,
28
+ "n": 1
29
+ }
30
+ },
31
+ "episodes": [
32
+ {
33
+ "task": "hard_multi",
34
+ "seed": 102,
35
+ "policy": "heuristic",
36
+ "total_reward": -2.5909,
37
+ "grader_score": 0.6228,
38
+ "success_score": 0.7,
39
+ "budget_score": 0.0818,
40
+ "adaptation_score": 0.6932,
41
+ "latency_score": 0.5593,
42
+ "sla_score": 1.0,
43
+ "success_rate": 0.7,
44
+ "steps": 20,
45
+ "actions": [
46
+ "route_to_a",
47
+ "route_to_a",
48
+ "route_to_a",
49
+ "route_to_a",
50
+ "route_to_a",
51
+ "route_to_a",
52
+ "route_to_b",
53
+ "route_to_b",
54
+ "route_to_b",
55
+ "route_to_b",
56
+ "route_to_b",
57
+ "route_to_b",
58
+ "route_to_b",
59
+ "route_to_b",
60
+ "route_to_b",
61
+ "route_to_c",
62
+ "route_to_c",
63
+ "route_to_c",
64
+ "route_to_c",
65
+ "route_to_c"
66
+ ],
67
+ "rewards": [
68
+ 0.9545454545454546,
69
+ 0.9545454545454546,
70
+ -2.0454545454545454,
71
+ -2.0454545454545454,
72
+ 0.9545454545454546,
73
+ -2.0454545454545454,
74
+ 0.7727272727272727,
75
+ 0.7727272727272727,
76
+ 0.7727272727272727,
77
+ 0.7727272727272727,
78
+ 0.7727272727272727,
79
+ -2.2272727272727275,
80
+ 0.7727272727272727,
81
+ -2.2272727272727275,
82
+ -2.2272727272727275,
83
+ 0.5454545454545454,
84
+ 0.5454545454545454,
85
+ 0.5454545454545454,
86
+ 0.5454545454545454,
87
+ 0.5454545454545454
88
+ ]
89
+ },
90
+ {
91
+ "task": "hard_multi",
92
+ "seed": 102,
93
+ "policy": "llm",
94
+ "total_reward": 6.0909,
95
+ "grader_score": 0.707,
96
+ "success_score": 0.8,
97
+ "budget_score": 0.0182,
98
+ "adaptation_score": 0.9091,
99
+ "latency_score": 0.6621,
100
+ "sla_score": 1.0,
101
+ "success_rate": 0.8889,
102
+ "steps": 20,
103
+ "actions": [
104
+ "route_to_a",
105
+ "route_to_a",
106
+ "route_to_a",
107
+ "route_to_b",
108
+ "route_to_b",
109
+ "route_to_b",
110
+ "route_to_b",
111
+ "route_to_b",
112
+ "route_to_b",
113
+ "route_to_b",
114
+ "route_to_b",
115
+ "route_to_b",
116
+ "route_to_c",
117
+ "route_to_c",
118
+ "route_to_c",
119
+ "route_to_c",
120
+ "route_to_c",
121
+ "route_to_c",
122
+ "shed_load",
123
+ "shed_load"
124
+ ],
125
+ "rewards": [
126
+ 0.9545454545454546,
127
+ 0.9545454545454546,
128
+ -2.0454545454545454,
129
+ 0.7727272727272727,
130
+ 0.7727272727272727,
131
+ 0.7727272727272727,
132
+ 0.7727272727272727,
133
+ 0.7727272727272727,
134
+ 0.7727272727272727,
135
+ 0.7727272727272727,
136
+ 0.7727272727272727,
137
+ -2.2272727272727275,
138
+ 0.5454545454545454,
139
+ 0.5454545454545454,
140
+ 0.5454545454545454,
141
+ 0.5454545454545454,
142
+ 0.5454545454545454,
143
+ 0.5454545454545454,
144
+ -0.5,
145
+ -0.5
146
+ ]
147
+ }
148
+ ]
149
+ }
eval/outputs/trace_compare/eval_seed101/eval_summary_20260425_192545.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Budget Router Evaluation — 20260425_192545
2
+
3
+ | Task | HEURISTIC Grader | LLM Grader | Notes |
4
+ |---|---|---|---|
5
+ | Hard_Multi | 0.6753 (n=1) | 0.7038 (n=1) | LLM +2.8 points vs heuristic |
eval/outputs/trace_compare/eval_seed101/eval_summary_20260425_192656.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Budget Router Evaluation — 20260425_192656
2
+
3
+ | Task | HEURISTIC Grader | LLM Grader | Notes |
4
+ |---|---|---|---|
5
+ | Hard_Multi | 0.6228 (n=1) | 0.7070 (n=1) | LLM +8.4 points vs heuristic |
eval/trace_episode.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Trace one Budget Router episode for a chosen policy, task, and seed.
4
+
5
+ This is a debugging/evidence tool: it prints per-step actions, step rewards,
6
+ costs, success/failure, latency, cumulative reward, and final grader metrics.
7
+ It does not expose hidden provider health to the policy.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import os
14
+ import sys
15
+ from pathlib import Path
16
+ from typing import Any, Dict, List, Optional
17
+
18
+ import typer
19
+
20
+ # Ensure imports work when run as `uv run python eval/trace_episode.py`.
21
+ sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
22
+
23
+ from budget_router.environment import BudgetRouterEnv
24
+ from budget_router.models import Action, ActionType, Observation, TaskConfig
25
+ from budget_router.policies import heuristic_baseline_policy
26
+ from budget_router.reward import episode_metrics, grade_episode
27
+ from budget_router.tasks import TASK_PRESETS
28
+ from inference import LLMRouter
29
+
30
+
31
+ app = typer.Typer(add_completion=False)
32
+
33
+ POLICIES = {"heuristic", "llm", "ppo"}
34
+ DEFAULT_PPO_MODELS = {
35
+ "easy": Path("trained_models/ppo_easy_50k.zip"),
36
+ "hard_multi": Path("trained_models/ppo_hard_multi_100k.zip"),
37
+ }
38
+ # Matches train/gym_wrapper.BudgetRouterGymEnv action order (Discrete 0..3).
39
+ _PPO_ACTION_NAMES = ("route_to_a", "route_to_b", "route_to_c", "shed_load")
40
+
41
+
42
+ def _echo_step_progress(
43
+ *,
44
+ policy_label: str,
45
+ step: int,
46
+ action: str,
47
+ reward: float,
48
+ cumulative: float,
49
+ done: bool,
50
+ llm_error: Optional[str] = None,
51
+ verbose: bool,
52
+ ) -> None:
53
+ if not verbose:
54
+ return
55
+ err = f" llm_error={llm_error}" if llm_error else ""
56
+ typer.echo(
57
+ f"[trace] policy={policy_label} step={step} action={action} "
58
+ f"reward={reward:+.3f} cum={cumulative:+.3f} done={done}{err}"
59
+ )
60
+
61
+ def _visible_observation_row(obs: Observation) -> Dict[str, float]:
62
+ """Public observation values available to the policy before it acts."""
63
+ return {
64
+ "provider_a_status": round(float(obs.provider_a_status), 4),
65
+ "provider_b_status": round(float(obs.provider_b_status), 4),
66
+ "provider_c_status": round(float(obs.provider_c_status), 4),
67
+ "observed_budget_remaining": round(float(obs.budget_remaining), 4),
68
+ "queue_backlog": round(float(obs.queue_backlog), 4),
69
+ "system_latency": round(float(obs.system_latency), 4),
70
+ "step_count": round(float(obs.step_count), 4),
71
+ }
72
+
73
+
74
+ def _visible_observation_row_from_array(values: Any) -> Dict[str, float]:
75
+ """Public observation values from the Gym wrapper's 7-field observation array."""
76
+ return {
77
+ "provider_a_status": round(float(values[0]), 4),
78
+ "provider_b_status": round(float(values[1]), 4),
79
+ "provider_c_status": round(float(values[2]), 4),
80
+ "observed_budget_remaining": round(float(values[3]), 4),
81
+ "queue_backlog": round(float(values[4]), 4),
82
+ "system_latency": round(float(values[5]), 4),
83
+ "step_count": round(float(values[6]), 4),
84
+ }
85
+
86
+
87
+ def _cumulative_step_rows(
88
+ history: List[Dict[str, Any]],
89
+ visible_observations: List[Dict[str, float]],
90
+ ) -> List[Dict[str, Any]]:
91
+ rows: List[Dict[str, Any]] = []
92
+ cumulative_reward = 0.0
93
+ cumulative_cost = 0.0
94
+
95
+ for item in history:
96
+ reward = float(item.get("reward", 0.0) or 0.0)
97
+ cost = float(item.get("cost", 0.0) or 0.0)
98
+ initial_budget = float(item.get("initial_budget", 0.0) or 0.0)
99
+ cumulative_reward += reward
100
+ cumulative_cost += cost
101
+ budget_remaining = max(0.0, initial_budget - cumulative_cost)
102
+
103
+ obs_row = visible_observations[len(rows)] if len(rows) < len(visible_observations) else {}
104
+ rows.append({
105
+ "step": int(item.get("step", len(rows) + 1)),
106
+ "action": item.get("action_type"),
107
+ "provider": item.get("provider"),
108
+ "success": bool(item.get("request_succeeded", False)),
109
+ "reward": round(reward, 4),
110
+ "cumulative_reward": round(cumulative_reward, 4),
111
+ "cost": round(cost, 4),
112
+ "budget_remaining": round(budget_remaining, 4),
113
+ "latency_ms": float(item.get("latency_ms", 0.0) or 0.0),
114
+ "queue_overflow": bool(item.get("queue_overflow", False)),
115
+ "budget_exhausted": bool(item.get("budget_exhausted", False)),
116
+ **obs_row,
117
+ })
118
+
119
+ return rows
120
+
121
+
122
def _run_heuristic(
    task_cfg: TaskConfig, seed: int, *, verbose: bool = False
) -> tuple[BudgetRouterEnv, List[Dict[str, float]]]:
    """Play one episode with the heuristic baseline policy.

    Returns the finished environment together with the visible observation
    rows recorded immediately before each action was chosen.
    """
    env = BudgetRouterEnv()
    observation = env.reset(seed=seed, scenario=task_cfg)
    seen: List[Dict[str, float]] = []
    running_total = 0.0
    while not observation.done:
        # Snapshot what the policy can see before it acts.
        seen.append(_visible_observation_row(observation))
        chosen = heuristic_baseline_policy(observation)
        action_name = chosen.action_type.value
        observation = env.step(chosen)
        step_reward = float(observation.reward or 0.0)
        running_total += step_reward
        _echo_step_progress(
            policy_label="heuristic",
            step=int(env._internal.current_step),
            action=action_name,
            reward=step_reward,
            cumulative=running_total,
            done=bool(observation.done),
            verbose=verbose,
        )
    return env, seen
146
+
147
+
148
def _run_llm(
    task_name: str, task_cfg: TaskConfig, seed: int, *, verbose: bool = False
) -> tuple[BudgetRouterEnv, List[Dict[str, float]]]:
    """Play one episode with the LLM router policy.

    Connection settings come from the environment: ``API_KEY`` (falling back
    to ``HF_TOKEN``), ``API_BASE_URL`` and ``MODEL_NAME``. Returns the finished
    environment and the visible observation rows captured before each action.

    Raises:
        RuntimeError: if neither ``API_KEY`` nor ``HF_TOKEN`` is set.
    """
    credential = os.getenv("API_KEY") or os.getenv("HF_TOKEN")
    endpoint = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
    model = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
    if not credential:
        raise RuntimeError("LLM policy requires HF_TOKEN or API_KEY.")

    router = LLMRouter(api_base_url=endpoint, model_name=model, api_key=credential)
    router.reset(task_name=task_name)

    env = BudgetRouterEnv()
    observation = env.reset(seed=seed, scenario=task_cfg)
    seen: List[Dict[str, float]] = []
    running_total = 0.0
    if verbose:
        banner = (
            f"[trace] begin policy=llm task={task_name} seed={seed} "
            f"endpoint={endpoint} model={model} "
            f"(~{task_cfg.max_steps} sequential LLM calls; first call starting…)"
        )
        typer.echo(banner)
    while not observation.done:
        seen.append(_visible_observation_row(observation))
        chosen = router.choose_action(observation)
        action_name = chosen.action_type.value
        observation = env.step(chosen)
        step_reward = float(observation.reward or 0.0)
        running_total += step_reward
        _echo_step_progress(
            policy_label="llm",
            step=int(env._internal.current_step),
            action=action_name,
            reward=step_reward,
            cumulative=running_total,
            done=bool(observation.done),
            llm_error=router.last_error,
            verbose=verbose,
        )
    return env, seen
188
+
189
+
190
def _default_ppo_model_path(task_name: str) -> Path:
    """Look up the bundled PPO checkpoint path for ``task_name``.

    Raises:
        ValueError: if ``DEFAULT_PPO_MODELS`` has no entry for the task.
    """
    checkpoint = DEFAULT_PPO_MODELS.get(task_name)
    if checkpoint is None:
        raise ValueError(
            f"No default PPO model for task '{task_name}'. "
            "Pass --model-path explicitly, or use task easy/hard_multi."
        )
    return checkpoint
197
+
198
+
199
def _run_ppo(
    task_name: str,
    task_cfg: TaskConfig,
    seed: int,
    model_path: Optional[Path],
    *,
    verbose: bool = False,
) -> tuple[BudgetRouterEnv, List[Dict[str, float]]]:
    """Play one episode with a saved PPO model through the Gym wrapper.

    When ``model_path`` is falsy the per-task default checkpoint is used.
    Returns the wrapper's inner environment and the visible observation rows
    captured before each action.

    Raises:
        RuntimeError: if the training dependencies cannot be imported.
        FileNotFoundError: if the resolved checkpoint does not exist.
    """
    # Lazy import keeps heuristic/LLM tracing available without training extras.
    try:
        from stable_baselines3 import PPO
        from train.gym_wrapper import BudgetRouterGymEnv
    except ImportError as exc:
        raise RuntimeError("PPO tracing requires training dependencies: `uv sync --extra training`.") from exc

    checkpoint = model_path or _default_ppo_model_path(task_name)
    if not checkpoint.exists():
        raise FileNotFoundError(f"PPO model not found: {checkpoint}")

    agent = PPO.load(str(checkpoint))
    wrapped_env = BudgetRouterGymEnv(scenario=task_cfg, seed=seed)
    observation, _ = wrapped_env.reset()
    seen: List[Dict[str, float]] = []
    running_total = 0.0
    finished = False
    while not finished:
        seen.append(_visible_observation_row_from_array(observation))
        predicted, _ = agent.predict(observation, deterministic=True)
        action_index = int(predicted)
        # Fall back to the raw index if the model emits something outside 0..3.
        if 0 <= action_index < len(_PPO_ACTION_NAMES):
            action_name = _PPO_ACTION_NAMES[action_index]
        else:
            action_name = str(action_index)
        observation, reward, terminated, truncated, _ = wrapped_env.step(action_index)
        step_reward = float(reward)
        running_total += step_reward
        finished = terminated or truncated
        core_env = wrapped_env._env
        _echo_step_progress(
            policy_label="ppo",
            step=int(core_env._internal.current_step),
            action=action_name,
            reward=step_reward,
            cumulative=running_total,
            done=finished,
            verbose=verbose,
        )

    return wrapped_env._env, seen
245
+
246
+
247
def trace_episode(
    task_name: str,
    seed: int,
    policy_name: str,
    model_path: Optional[Path] = None,
    *,
    verbose: bool = False,
) -> Dict[str, Any]:
    """Run one episode and return step rows plus final scorer outputs.

    Args:
        task_name: Key into ``TASK_PRESETS``.
        seed: Episode seed passed to the environment.
        policy_name: One of ``POLICIES`` ("heuristic", "llm" or "ppo").
        model_path: Optional PPO checkpoint; only used by the "ppo" policy.
        verbose: When true, each runner echoes one progress line per step.

    Returns:
        A dict with ``task``, ``seed``, ``policy``, ``episode_length``,
        ``total_reward``, ``grader`` (values rounded to four decimals),
        ``metrics`` and the per-step ``steps`` rows.

    Raises:
        ValueError: if ``task_name`` or ``policy_name`` is unknown.
    """
    if task_name not in TASK_PRESETS:
        raise ValueError(f"Unknown task '{task_name}'. Choose from: {sorted(TASK_PRESETS)}")
    if policy_name not in POLICIES:
        raise ValueError(f"Unknown policy '{policy_name}'. Choose from: {sorted(POLICIES)}")

    task_cfg = TASK_PRESETS[task_name]
    if policy_name == "heuristic":
        env, visible_observations = _run_heuristic(task_cfg=task_cfg, seed=seed, verbose=verbose)
    elif policy_name == "llm":
        env, visible_observations = _run_llm(
            task_name=task_name, task_cfg=task_cfg, seed=seed, verbose=verbose
        )
    else:
        env, visible_observations = _run_ppo(
            task_name=task_name,
            task_cfg=task_cfg,
            seed=seed,
            model_path=model_path,
            verbose=verbose,
        )

    history = env._internal.history
    steps = _cumulative_step_rows(history, visible_observations)
    grader = {k: round(float(v), 4) for k, v in grade_episode(history).items()}

    # Take the total from the last row's running sum, which is accumulated from
    # unrounded rewards and rounded once. Summing the per-row "reward" values
    # would add up to ``len(steps)`` individual rounding errors and can disagree
    # with the final "cumulative_reward" shown in the table.
    total_reward = steps[-1]["cumulative_reward"] if steps else 0.0

    return {
        "task": task_name,
        "seed": seed,
        "policy": policy_name,
        "episode_length": len(steps),
        "total_reward": total_reward,
        "grader": grader,
        "metrics": episode_metrics(history),
        "steps": steps,
    }
291
+
292
+
293
def _print_trace(result: Dict[str, Any]) -> None:
    """Print a trace result (as built by ``trace_episode``) via ``typer.echo``.

    Output is a short summary, the grader scores, and one table row per step.
    Steps that have no recorded provider status print ``0.000`` in the
    status columns.
    """
    typer.echo(f"Task={result['task']} Policy={result['policy']} Seed={result['seed']}")
    typer.echo(f"Episode length={result['episode_length']} Total reward={result['total_reward']:+.4f}")
    typer.echo("Grader:")
    for key, value in result["grader"].items():
        typer.echo(f" {key}: {value:.4f}")

    typer.echo("")
    # Pad every header cell with the same width and alignment as the matching
    # row cell below, so the header, the separator and the rows line up.
    typer.echo(
        f"{'Step':>4} | {'A_stat':>6} | {'B_stat':>6} | {'C_stat':>6} | "
        f"{'Action':<11} | {'Provider':>8} | {'Success':>7} | {'Reward':>7} | "
        f"{'CumReward':>9} | {'Cost':>4} | {'Budget':>6} | {'Latency':>7} | Flags"
    )
    typer.echo(
        "-----|--------|--------|--------|-------------|----------|---------|"
        "---------|-----------|------|--------|---------|------"
    )
    for row in result["steps"]:
        flags = []
        if row["queue_overflow"]:
            flags.append("queue_overflow")
        if row["budget_exhausted"]:
            flags.append("budget_exhausted")
        typer.echo(
            f"{row['step']:>4} | {row.get('provider_a_status', 0.0):>6.3f} | "
            f"{row.get('provider_b_status', 0.0):>6.3f} | "
            f"{row.get('provider_c_status', 0.0):>6.3f} | "
            f"{row['action']:<11} | {str(row['provider'] or '-'):>8} | "
            f"{str(row['success']).lower():>7} | {row['reward']:>+7.2f} | "
            f"{row['cumulative_reward']:>+9.2f} | {row['cost']:>4.2f} | "
            f"{row['budget_remaining']:>6.2f} | {row['latency_ms']:>7.2f} | {','.join(flags) or '-'}"
        )
324
+
325
+
326
@app.command()
def main(
    task: str = typer.Option("hard_multi", help=f"Task name: {' | '.join(TASK_PRESETS)}"),
    seed: int = typer.Option(..., help="Exact episode seed."),
    policy: str = typer.Option("heuristic", help=f"Policy: {' | '.join(sorted(POLICIES))}"),
    model_path: Optional[Path] = typer.Option(None, help="PPO model path. Defaults exist for easy/hard_multi."),
    output_json: Optional[Path] = typer.Option(None, help="Optional path to save the full trace JSON."),
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Print one line per env step during the episode (useful for slow LLM runs).",
    ),
) -> None:
    """Run and print a single exact-seed episode trace."""
    trace = trace_episode(
        task_name=task,
        seed=seed,
        policy_name=policy,
        model_path=model_path,
        verbose=verbose,
    )
    _print_trace(trace)

    # Saving the JSON copy is opt-in.
    if output_json is None:
        return

    output_json.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(trace, indent=2)
    output_json.write_text(serialized + "\n", encoding="utf-8")
    typer.echo(f"\nSaved trace JSON: {output_json}")
354
+
355
+
356
+ if __name__ == "__main__":
357
+ app()
eval_sft.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # dependencies = [
4
+ # "torch",
5
+ # "transformers>=4.45.0",
6
+ # "huggingface_hub>=0.24.0",
7
+ # "scipy",
8
+ # "budget-router @ git+https://huggingface.co/spaces/akshay4/budget-router-openenv",
9
+ # ]
10
+ # ///
11
+ """Evaluate a Budget Router SFT model against the heuristic baseline."""
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import math
18
+ import os
19
+ import time
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ import numpy as np
24
+
25
+ from budget_router.environment import BudgetRouterEnv
26
+ from budget_router.models import Action, ActionType, Observation, TaskConfig
27
+ from budget_router.policies import heuristic_baseline_policy
28
+ from budget_router.reward import episode_metrics, grade_episode
29
+ from budget_router.tasks import HARD_MULTI, TASK_PRESETS
30
+ try:
31
+ from inference import SYSTEM_PROMPT
32
+
33
+ _SYSTEM_PROMPT_SOURCE = "inference"
34
+ except ModuleNotFoundError as exc:
35
+ if exc.name != "inference":
36
+ raise
37
+ SYSTEM_PROMPT = """
38
+ You are a cost-aware LLM API routing agent managing a production system.
39
+ At each step, output EXACTLY ONE action string. Nothing else.
40
+
41
+ ENVIRONMENT:
42
+ Three providers: A ($0.01/req, cheapest), B ($0.05/req), C ($0.10/req, most reliable).
43
+ provider_X_status = windowed success rate [0=always fails, 1=always succeeds].
44
+ IMPORTANT: A status of exactly 0.500 means this provider has NEVER been routed to
45
+ in this episode — it is unobserved, not confirmed healthy. Route to it once to get
46
+ a real reading. Do not treat 0.500 as a health signal.
47
+ budget_remaining: fraction of budget left. Reaching 0 = catastrophic -10 penalty.
48
+ step_count [0→1], steps_remaining: episode progress (20 steps total).
49
+
50
+ VALID ACTIONS (output ONLY one):
51
+ route_to_a | route_to_b | route_to_c | shed_load
52
+
53
+ GOLDEN RULE — DEFAULT STRATEGY:
54
+ Stay on the CHEAPEST provider whose status > 0.52. Only deviate if there is CLEAR, SUSTAINED evidence of degradation (defined below). Unnecessary switching to expensive providers burns budget and reduces your score.
55
+
56
+ NOISE CALIBRATION (critical):
57
+ - Status fluctuates due to Bernoulli sampling noise. Single-step dips are not reliable signals.
58
+ - Use the provided 2-step trend (avg/step): a sustained negative trend across multiple steps
59
+ indicates real degradation; a trend near 0 means the provider is stable. Do NOT switch on noise.
60
+ - REAL degradation signal: sustained negative trend AND current status is visibly declining.
61
+ - Only when both conditions hold across consecutive observations should you consider early switching.
62
+ - On stable tasks, trends hover near zero. Switching on noise burns budget without benefit.
63
+
64
+
65
+ WHEN TO SWITCH (use your conversation history):
66
+ A → B: When trend_a is clearly and consistently negative AND status_a is approaching unreliable,
67
+ OR status_a is already below 0.52 (failure probability exceeds success probability).
68
+ B → C: Same principle — sustained decline signals, not single-step noise.
69
+ Never switch based on a single bad observation — noise causes occasional dips.
70
+
71
+ BUDGET RUNWAY — HARD CONSTRAINT:
72
+ budget_runway_at_current_rate shows how many more steps you can afford at current spend rate.
73
+ If budget_runway_at_current_rate < steps_remaining: switch to a cheaper provider IMMEDIATELY.
74
+ If budget_remaining < 0.15 (less than 15% left): treat C as OFF-LIMITS unless A and B are
75
+ both below 0.30 status. Prefer shed_load over routing C when budget is this low.
76
+ NEVER route to any provider if doing so would leave budget_remaining below the cost of
77
+ that provider times the steps_remaining. The -10 bankruptcy penalty destroys all episode
78
+ value accumulated so far — budget survival is non-negotiable.
79
+ TASK PROFILES (the task name appears in each observation — use it):
80
+ easy: Stable environment. Trend fluctuations are mostly noise. Stay on the cheapest provider unless its trend is catastrophically and sustainedly negative.
81
+ medium: Dynamic environment. A provider may degrade mid-episode. Monitor trends and switch to the next cheapest healthy fallback if the primary fails.
82
+ hard / hard_multi: Hostile, multi-failure environments. Multiple providers may degrade at unexpected times in unpredictable sequences.
83
+ Your Runbook: Always map traffic to the lowest-cost healthy provider (A=$0.01, B=$0.05, C=$0.10).
84
+ Watch your conversation history: if your currently active provider shows a clear, sustained negative trend, switch early to the next cheapest option that is healthy.
85
+ CRITICAL: Before switching to expensive fallbacks (like C), use budget_runway to verify you can afford them to prevent budget exhaustion.
86
+
87
+ Output only the action string."""
88
+ _SYSTEM_PROMPT_SOURCE = "embedded_fallback"
89
+
90
+ _AGENT_DEBUG_LOG = "/Users/akshaybabbar/Desktop/work/.cursor/debug-e4cac3.log"
91
+
92
+
93
+ def _agent_debug_ndjson(payload: dict[str, object]) -> None:
94
+ line = json.dumps(payload)
95
+ try:
96
+ with open(_AGENT_DEBUG_LOG, "a", encoding="utf-8") as f:
97
+ f.write(line + "\n")
98
+ except OSError:
99
+ print(f"[agent-debug] {line}", flush=True)
100
+
101
+
102
+ VALID_ACTIONS = ["route_to_a", "route_to_b", "route_to_c", "shed_load"]
103
+ DEFAULT_MODEL_REPO = "akshay4/budget-router-sft-qwen1.5b"
104
+
105
+
106
+ def _steps_remaining(obs: Observation, max_steps: int = 20) -> int:
107
+ elapsed = int(round(float(obs.step_count) * max_steps))
108
+ return max(0, max_steps - elapsed)
109
+
110
+
111
+ def _trend_text(obs: Observation, previous_obs: Observation | None, previous2_obs: Observation | None) -> str:
112
+ if previous2_obs is not None:
113
+ ta = (obs.provider_a_status - previous2_obs.provider_a_status) / 2.0
114
+ tb = (obs.provider_b_status - previous2_obs.provider_b_status) / 2.0
115
+ tc = (obs.provider_c_status - previous2_obs.provider_c_status) / 2.0
116
+ return f"trend (avg/step, 2-step): A:{ta:+.3f} B:{tb:+.3f} C:{tc:+.3f}"
117
+ if previous_obs is not None:
118
+ ta = obs.provider_a_status - previous_obs.provider_a_status
119
+ tb = obs.provider_b_status - previous_obs.provider_b_status
120
+ tc = obs.provider_c_status - previous_obs.provider_c_status
121
+ return f"trend (1-step only, noisy): A:{ta:+.3f} B:{tb:+.3f} C:{tc:+.3f}"
122
+ return "trend: unavailable"
123
+
124
+
125
+ def _budget_runway_text(obs: Observation, previous_obs: Observation | None) -> str:
126
+ if previous_obs is None:
127
+ return "budget_runway_at_current_rate: >20 steps"
128
+ budget_spent = float(previous_obs.budget_remaining) - float(obs.budget_remaining)
129
+ if budget_spent <= 0.001:
130
+ return "budget_runway_at_current_rate: >20 steps"
131
+ runway = int(float(obs.budget_remaining) / budget_spent)
132
+ return f"budget_runway_at_current_rate: ~{runway} steps"
133
+
134
+
135
+ def _previous_step_feedback(obs: Observation) -> str:
136
+ metadata = getattr(obs, "metadata", None) or {}
137
+ if not metadata.get("action_type"):
138
+ return ""
139
+ parts = [
140
+ "previous_step_feedback:",
141
+ f" previous_action: {metadata.get('action_type')}",
142
+ ]
143
+ if obs.reward is not None:
144
+ parts.append(f" previous_reward: {float(obs.reward):+.2f}")
145
+ if metadata.get("request_succeeded") is not None:
146
+ parts.append(f" previous_success: {str(bool(metadata.get('request_succeeded'))).lower()}")
147
+ if metadata.get("cost") is not None:
148
+ parts.append(f" previous_cost: {float(metadata.get('cost')):.2f}")
149
+ if metadata.get("latency_ms") is not None:
150
+ parts.append(f" previous_latency_ms: {float(metadata.get('latency_ms')):.2f}")
151
+ if metadata.get("budget_exhausted"):
152
+ parts.append(" previous_budget_exhausted: true")
153
+ return "\n".join(parts)
154
+
155
+
156
def format_observation_for_sft(
    *,
    obs: Observation,
    task_name: str,
    previous_obs: Observation | None,
    previous2_obs: Observation | None,
) -> str:
    """Render one observation as the newline-separated user message for the model.

    The message lists the task name, the seven numeric observation fields
    (three decimals each), the remaining-step estimate, the trend line and the
    budget runway line. The previous-step feedback block is appended only when
    it is non-empty.
    """
    numeric_fields = (
        "provider_a_status",
        "provider_b_status",
        "provider_c_status",
        "budget_remaining",
        "queue_backlog",
        "system_latency",
        "step_count",
    )
    rendered = [f"task: {task_name}"]
    rendered.extend(f"{name}: {getattr(obs, name):.3f}" for name in numeric_fields)
    rendered.append(f"steps_remaining: {_steps_remaining(obs)}")
    rendered.append(_trend_text(obs, previous_obs, previous2_obs))
    rendered.append(_budget_runway_text(obs, previous_obs))

    feedback = _previous_step_feedback(obs)
    if feedback:
        rendered.append(feedback)
    return "\n".join(rendered)
180
+
181
+
182
def parse_action(text: str) -> tuple[str, bool]:
    """Extract a valid action name from raw model output.

    The text is stripped and lower-cased, then ``VALID_ACTIONS`` is scanned in
    its declared order and the first action found as a substring is returned
    with ``True``. If none is found, ``("route_to_a", False)`` is returned.
    """
    haystack = text.strip().lower()
    found = next((candidate for candidate in VALID_ACTIONS if candidate in haystack), None)
    if found is None:
        return "route_to_a", False
    return found, True
188
+
189
+
190
def apply_budget_safety_guard(action_str: str, observation: Observation, task_cfg: TaskConfig) -> str:
    """Replace a routing action with ``shed_load`` when it would drain the budget.

    The remaining budget in dollars is ``observation.budget_remaining`` times
    ``task_cfg.initial_budget``. A routing action is kept only if its cost is
    strictly below that amount (within a 1e-9 tolerance). ``shed_load`` is
    always passed through; action names without a known cost are priced at 0.
    """
    if action_str == "shed_load":
        return action_str

    price_by_action = dict(
        route_to_a=task_cfg.cost_a,
        route_to_b=task_cfg.cost_b,
        route_to_c=task_cfg.cost_c,
    )
    price = price_by_action.get(action_str, 0.0)
    dollars_left = float(observation.budget_remaining) * float(task_cfg.initial_budget)
    would_exhaust = price >= dollars_left - 1e-9
    return "shed_load" if would_exhaust else action_str
203
+
204
+
205
def run_heuristic_episode(task_cfg: TaskConfig, seed: int) -> dict[str, Any]:
    """Play one episode with the heuristic baseline and score it.

    Returns a dict with the overall grader score, summed step reward, episode
    length, the full grader output and the episode metrics.
    """
    env = BudgetRouterEnv()
    observation = env.reset(seed=seed, scenario=task_cfg)
    reward_sum = 0.0
    while not observation.done:
        chosen = heuristic_baseline_policy(observation)
        observation = env.step(chosen)
        reward_sum += float(observation.reward or 0.0)

    grader = grade_episode(env._internal.history)
    metrics = episode_metrics(env._internal.history)
    summary: dict[str, Any] = {}
    summary["grader_score"] = float(grader["overall_score"])
    summary["total_reward"] = reward_sum
    summary["episode_length"] = env._internal.current_step
    summary["grader"] = grader
    summary["metrics"] = metrics
    return summary
221
+
222
+
223
class SFTPolicy:
    """Routing policy backed by a locally loaded causal language model.

    The policy keeps a running chat transcript (system prompt, then alternating
    observation and action messages) and the two most recent observations, which
    are used to render trend and budget-runway hints. Call ``reset`` before each
    episode.
    """

    def __init__(self, model_repo: str, *, token: str | None, use_budget_guard: bool) -> None:
        """Load the model and tokenizer from ``model_repo``.

        Args:
            model_repo: Model identifier passed to ``from_pretrained``.
            token: Optional access token forwarded to ``from_pretrained``.
            use_budget_guard: When true, each chosen action is passed through
                ``apply_budget_safety_guard`` before being returned.
        """
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.device == "cuda":
            dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        else:
            # Half precision on CPU is slow and not implemented for every
            # operator; full precision is the safe choice there.
            dtype = torch.float32
        self.model = AutoModelForCausalLM.from_pretrained(model_repo, torch_dtype=dtype, token=token)
        self.model.to(self.device)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model_repo, token=token)
        if self.tokenizer.pad_token is None:
            # Reuse EOS as the padding token when the tokenizer defines none.
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.use_budget_guard = use_budget_guard
        self.messages: list[dict[str, str]] = []
        self.previous_obs: Observation | None = None
        self.previous2_obs: Observation | None = None
        self.parse_failures = 0

    def reset(self) -> None:
        """Start a fresh episode: new transcript, no history, zero parse failures."""
        self.messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        self.previous_obs = None
        self.previous2_obs = None
        self.parse_failures = 0

    def choose_action(self, obs: Observation, *, task_name: str, task_cfg: TaskConfig) -> str:
        """Generate the next action string for ``obs``.

        The observation is appended to the transcript as a user message, the
        model greedily generates up to 10 new tokens, and the output is parsed
        into an action. Unparseable output counts as a parse failure and falls
        back to the parser's default action. The returned action (after the
        optional budget guard) is stored as the assistant message.
        """
        import torch

        obs_text = format_observation_for_sft(
            obs=obs,
            task_name=task_name,
            previous_obs=self.previous_obs,
            previous2_obs=self.previous2_obs,
        )
        self.messages.append({"role": "user", "content": obs_text})
        prompt = self.tokenizer.apply_chat_template(
            self.messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        # Decode only the tokens generated after the prompt.
        generated = self.tokenizer.decode(
            output[0][inputs["input_ids"].shape[1] :],
            skip_special_tokens=True,
        )
        action_str, ok = parse_action(generated)
        if not ok:
            self.parse_failures += 1
        if self.use_budget_guard:
            action_str = apply_budget_safety_guard(action_str, obs, task_cfg)
        self.messages.append({"role": "assistant", "content": action_str})
        # Shift the observation history by one step.
        self.previous2_obs = self.previous_obs
        self.previous_obs = obs
        return action_str
284
+
285
+
286
def run_sft_episode(policy: SFTPolicy, task_name: str, task_cfg: TaskConfig, seed: int) -> dict[str, Any]:
    """Play one episode with the SFT policy and score it.

    The policy is reset first. The returned dict has the same fields as
    ``run_heuristic_episode`` plus the list of chosen ``actions`` and the
    policy's ``parse_failures`` count for the episode.
    """
    env = BudgetRouterEnv()
    policy.reset()
    observation = env.reset(seed=seed, scenario=task_cfg)
    reward_sum = 0.0
    chosen_actions: list[str] = []
    while not observation.done:
        action_name = policy.choose_action(observation, task_name=task_name, task_cfg=task_cfg)
        chosen_actions.append(action_name)
        observation = env.step(Action(action_type=ActionType(action_name)))
        reward_sum += float(observation.reward or 0.0)

    grader = grade_episode(env._internal.history)
    metrics = episode_metrics(env._internal.history)
    summary: dict[str, Any] = {}
    summary["grader_score"] = float(grader["overall_score"])
    summary["total_reward"] = reward_sum
    summary["episode_length"] = env._internal.current_step
    summary["grader"] = grader
    summary["metrics"] = metrics
    summary["actions"] = chosen_actions
    summary["parse_failures"] = policy.parse_failures
    return summary
308
+
309
+
310
+ def _mean(values: list[float]) -> float:
311
+ return float(sum(values) / len(values)) if values else 0.0
312
+
313
+
314
+ def _sample_std(values: list[float]) -> float:
315
+ if len(values) < 2:
316
+ return 0.0
317
+ mean = _mean(values)
318
+ return float(math.sqrt(sum((v - mean) ** 2 for v in values) / (len(values) - 1)))
319
+
320
+
321
def compute_paired_stats(heuristic_scores: list[float], sft_scores: list[float]) -> dict[str, Any]:
    """Compare paired per-seed scores of the SFT policy against the heuristic.

    Runs a one-sided paired t-test (alternative: SFT greater than heuristic)
    and reports means, sample standard deviations, the mean difference,
    Cohen's d on the differences and win/tie/loss counts.

    Special cases:
        * Fewer than two pairs: no variance can be estimated, so ``t_stat``,
          ``p_val`` and ``cohens_d`` are NaN and ``significant`` is ``False``.
        * Two or more pairs with identical differences (zero standard
          deviation): ``t_stat`` and ``cohens_d`` are +/-inf (or 0 when the
          difference is 0) and ``p_val`` is 0.0 for a positive difference,
          otherwise 1.0.
        * SciPy unavailable or failing: ``t_stat`` is computed by hand and
          ``p_val`` is NaN.

    Raises:
        ValueError: if the lists differ in length or are empty.
    """
    if len(heuristic_scores) != len(sft_scores):
        raise ValueError("Paired stats require equal-length score lists.")
    if not heuristic_scores:
        raise ValueError("No scores provided.")

    diffs = [s - h for h, s in zip(heuristic_scores, sft_scores)]
    n = len(diffs)
    delta = _mean(diffs)
    std_diff = _sample_std(diffs)
    if n < 2:
        # A single pair always has zero sample deviation; treating that as a
        # degenerate "perfect" result would flag any positive difference as
        # significant with p = 0. Report the statistics as undefined instead.
        t_stat = math.nan
        p_val = math.nan
        cohens_d = math.nan
    elif std_diff == 0.0:
        t_stat = math.inf if delta > 0 else (-math.inf if delta < 0 else 0.0)
        p_val = 0.0 if delta > 0 else 1.0
        cohens_d = math.inf if delta > 0 else (-math.inf if delta < 0 else 0.0)
    else:
        try:
            from scipy import stats

            t_stat, p_val = stats.ttest_rel(sft_scores, heuristic_scores, alternative="greater")
            cohens_d = delta / std_diff
        except Exception:
            # Deliberate best effort: without a working SciPy the t statistic is
            # still reported, but no p-value can be derived.
            t_stat = delta / (std_diff / math.sqrt(n))
            p_val = float("nan")
            cohens_d = delta / std_diff

    return {
        "n_seeds": n,
        "mean_heuristic": _mean(heuristic_scores),
        "mean_sft": _mean(sft_scores),
        "std_heuristic": _sample_std(heuristic_scores),
        "std_sft": _sample_std(sft_scores),
        "delta": delta,
        "t_stat": float(t_stat),
        "p_val": float(p_val),
        "cohens_d": float(cohens_d),
        # A NaN p-value compares False, so undefined tests are never significant.
        "significant": bool(delta > 0 and p_val < 0.05),
        "wins": sum(1 for d in diffs if d > 0),
        "ties": sum(1 for d in diffs if d == 0),
        "losses": sum(1 for d in diffs if d < 0),
    }
361
+
362
+
363
def _ci95(values: list[float]) -> tuple[float, float]:
    """Return a 95% confidence interval for the mean of ``values``.

    Uses the Student t distribution when SciPy is usable and a normal
    approximation (mean +/- 1.96 standard errors) otherwise. With fewer than
    two values, or when all values are identical (zero standard error), the
    interval collapses to ``(mean, mean)``.
    """
    n = len(values)
    mean = _mean(values)
    if n < 2:
        return mean, mean
    se = _sample_std(values) / math.sqrt(n)
    if se == 0.0:
        # SciPy treats scale=0 as an invalid parameter and returns NaN bounds;
        # the interval around identical values is just the mean itself.
        return mean, mean
    try:
        from scipy import stats

        lo, hi = stats.t.interval(0.95, df=n - 1, loc=mean, scale=se)
        return float(lo), float(hi)
    except Exception:
        # Best-effort fallback when SciPy is unavailable or fails.
        return mean - 1.96 * se, mean + 1.96 * se
376
+
377
+
378
+ def _parse_seed_values(value: str | None, n_seeds: int) -> list[int]:
379
+ if value:
380
+ return [int(part) for part in value.replace(",", " ").split()]
381
+ return list(range(300, 300 + n_seeds))
382
+
383
+
384
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the SFT evaluation script.

    Most options take their default from an environment variable
    (``SFT_MODEL_REPO``, ``TASK_NAME``, ``N_SEEDS``, ``EVAL_SEED_VALUES``,
    ``EVAL_OUTPUT_JSON``) so the script can be configured without flags.
    """
    env = os.getenv
    cli = argparse.ArgumentParser(description="Evaluate SFT Budget Router model.")
    cli.add_argument("--model-repo", default=env("SFT_MODEL_REPO", DEFAULT_MODEL_REPO))
    cli.add_argument("--task", default=env("TASK_NAME", "hard_multi"), choices=sorted(TASK_PRESETS))
    cli.add_argument("--n-seeds", type=int, default=int(env("N_SEEDS", "10")))
    cli.add_argument("--seed-values", default=env("EVAL_SEED_VALUES"))
    cli.add_argument("--output-json", default=env("EVAL_OUTPUT_JSON", "eval_results_sft.json"))
    # Plain on/off switches.
    for switch in ("--no-budget-guard", "--no-upload"):
        cli.add_argument(switch, action="store_true")
    return cli.parse_args()
394
+
395
+
396
def main() -> None:
    """Evaluate the SFT model against the heuristic on paired seeds and report.

    For every seed both policies play one episode. The paired statistics and
    per-episode details are written to ``--output-json``, a markdown table and
    a one-line verdict are printed, and unless ``--no-upload`` is given the
    JSON file is uploaded to the model repository.

    Raises:
        RuntimeError: if uploading is requested but ``HF_TOKEN`` is not set.
    """
    args = parse_args()
    token = os.environ.get("HF_TOKEN")
    task_cfg = TASK_PRESETS[args.task]
    seeds = _parse_seed_values(args.seed_values, args.n_seeds)
    # #region agent log
    _agent_debug_ndjson(
        {
            "sessionId": "e4cac3",
            "runId": os.environ.get("DEBUG_RUN_ID", "eval-import-fix"),
            "hypothesisId": "H1",
            "location": "eval_sft.py:main",
            "message": "eval_startup",
            "data": {
                "system_prompt_source": _SYSTEM_PROMPT_SOURCE,
                "model_repo": args.model_repo,
                "task": args.task,
                "n_seeds": len(seeds),
            },
            "timestamp": int(time.time() * 1000),
        }
    )
    # #endregion
    policy = SFTPolicy(args.model_repo, token=token, use_budget_guard=not args.no_budget_guard)

    episodes: list[dict[str, Any]] = []
    heuristic_scores: list[float] = []
    sft_scores: list[float] = []
    for seed in seeds:
        # Both policies see the same seed so the comparison is paired.
        heuristic_ep = run_heuristic_episode(task_cfg, seed)
        sft_ep = run_sft_episode(policy, args.task, task_cfg, seed)
        heuristic_scores.append(float(heuristic_ep["grader_score"]))
        sft_scores.append(float(sft_ep["grader_score"]))
        episodes.append({"seed": seed, "heuristic": heuristic_ep, "sft": sft_ep})
        print(
            f"[eval-sft] seed={seed} heuristic={heuristic_ep['grader_score']:.4f} "
            f"sft={sft_ep['grader_score']:.4f} delta={sft_ep['grader_score'] - heuristic_ep['grader_score']:+.4f} "
            f"parse_failures={sft_ep['parse_failures']}",
            flush=True,
        )

    stats = compute_paired_stats(heuristic_scores, sft_scores)
    heu_ci = _ci95(heuristic_scores)
    sft_ci = _ci95(sft_scores)
    result = {
        **stats,
        "task": args.task,
        "seeds": seeds,
        "heuristic_scores": heuristic_scores,
        "sft_scores": sft_scores,
        "heuristic_ci95": heu_ci,
        "sft_ci95": sft_ci,
        "budget_guard": not args.no_budget_guard,
        "episodes": episodes,
    }
    output_path = Path(args.output_json)
    # Create missing parent directories first: the evaluation above is
    # expensive and must not be lost to a FileNotFoundError on a nested path.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(result, indent=2, sort_keys=True), encoding="utf-8")

    print()
    print("| Policy | Mean | Std | 95% CI | vs Heuristic |")
    print("|---|---:|---:|---|---:|")
    print(
        f"| Heuristic | {stats['mean_heuristic']:.3f} | {stats['std_heuristic']:.3f} | "
        f"[{heu_ci[0]:.3f}, {heu_ci[1]:.3f}] | baseline |"
    )
    print(
        f"| SFT | {stats['mean_sft']:.3f} | {stats['std_sft']:.3f} | "
        f"[{sft_ci[0]:.3f}, {sft_ci[1]:.3f}] | {stats['delta']:+.3f} |"
    )
    verdict = "SIGNIFICANT" if stats["significant"] else "NOT SIGNIFICANT"
    print(
        f"SFT: {stats['mean_sft']:.3f} vs Heuristic: {stats['mean_heuristic']:.3f} | "
        f"delta={stats['delta']:+.3f} | t({stats['n_seeds'] - 1})={stats['t_stat']:.2f}, "
        f"p={stats['p_val']:.4f} | {verdict} | Cohen's d={stats['cohens_d']:.2f} | "
        f"wins/ties/losses={stats['wins']}/{stats['ties']}/{stats['losses']}"
    )

    if not args.no_upload:
        if not token:
            raise RuntimeError("HF_TOKEN must be set to upload eval JSON. Use --no-upload to skip.")
        from huggingface_hub import upload_file

        upload_file(
            path_or_fileobj=args.output_json,
            path_in_repo=Path(args.output_json).name,
            repo_id=args.model_repo,
            repo_type="model",
            token=token,
        )
        print(f"[eval-sft] uploaded {args.output_json} to {args.model_repo}", flush=True)
485
+
486
+
487
+ if __name__ == "__main__":
488
+ main()
generate_sft_data.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate SFT data for Budget Router.
4
+
5
+ Default path is deliberately zero-API-cost: distill the existing PPO hard_multi
6
+ policy into chat transcripts, then push the dataset to the Hub for HF Jobs.
7
+
8
+ Optional LLM labeling is available with --teacher llm, but it costs one large
9
+ model call per environment step (20 calls per episode).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import math
17
+ import os
18
+ from pathlib import Path
19
+ from typing import Any, Callable
20
+
21
+ import numpy as np
22
+
23
+ from budget_router.environment import BudgetRouterEnv
24
+ from budget_router.models import Action, ActionType, Observation, TaskConfig
25
+ from budget_router.policies import heuristic_baseline_policy
26
+ from budget_router.reward import episode_metrics, grade_episode
27
+ from budget_router.tasks import HARD_MULTI, TASK_PRESETS
28
+ from inference import LLMRouter, SYSTEM_PROMPT
29
+
30
+
31
# Action strings accepted when recording a teacher episode; anything else is
# replaced with "shed_load" by collect_teacher_episode.
VALID_ACTIONS = [
    "route_to_a",
    "route_to_b",
    "route_to_c",
    "shed_load",
]

# Index -> action name lookup applied to the PPO model's predicted action.
PPO_ACTION_NAMES = [
    "route_to_a",
    "route_to_b",
    "route_to_c",
    "shed_load",
]

# Defaults for the --dataset-repo and --ppo-model-path CLI options.
DEFAULT_DATASET_REPO = "akshay4/budget-router-sft-data"
DEFAULT_PPO_MODEL_PATH = "trained_models/ppo_hard_multi_100k.zip"

# Loaded PPO policies keyed by model path, so a checkpoint is read once.
_PPO_POLICY_CACHE: dict[str, Callable[[Observation], str]] = {}
36
+
37
+
38
def _obs_to_array(obs: Observation) -> np.ndarray:
    """Pack the seven scalar observation fields into a float32 vector.

    The element order is fixed: provider A/B/C status, budget remaining,
    queue backlog, system latency, step count. The result is what the PPO
    teacher passes to ``model.predict``.
    """
    ordered_fields = (
        "provider_a_status",
        "provider_b_status",
        "provider_c_status",
        "budget_remaining",
        "queue_backlog",
        "system_latency",
        "step_count",
    )
    values = [getattr(obs, field_name) for field_name in ordered_fields]
    return np.array(values, dtype=np.float32)
51
+
52
+
53
def _steps_remaining(obs: Observation, max_steps: int = 20) -> int:
    """Return the number of steps left in the episode, never below zero.

    ``obs.step_count`` is treated as a fraction of ``max_steps``: it is scaled
    by ``max_steps`` and rounded to recover the elapsed step count.
    """
    steps_taken = int(round(max_steps * float(obs.step_count)))
    left = max_steps - steps_taken
    return left if left > 0 else 0
56
+
57
+
58
def _trend_text(obs: Observation, previous_obs: Observation | None, previous2_obs: Observation | None) -> str:
    """Describe how the three provider statuses moved over recent steps.

    Uses the observation from two steps back when available (change averaged
    per step), otherwise the single previous observation, otherwise reports
    that no trend is available.
    """
    status_fields = ("provider_a_status", "provider_b_status", "provider_c_status")

    if previous2_obs is not None:
        label = "trend (avg/step, 2-step)"
        changes = [
            (getattr(obs, name) - getattr(previous2_obs, name)) / 2.0
            for name in status_fields
        ]
    elif previous_obs is not None:
        label = "trend (1-step only, noisy)"
        changes = [
            getattr(obs, name) - getattr(previous_obs, name)
            for name in status_fields
        ]
    else:
        return "trend: unavailable"

    delta_a, delta_b, delta_c = changes
    return f"{label}: A:{delta_a:+.3f} B:{delta_b:+.3f} C:{delta_c:+.3f}"
70
+
71
+
72
def _budget_runway_text(obs: Observation, previous_obs: Observation | None) -> str:
    """Estimate how many steps the budget lasts at the last step's spend rate.

    Reports ">20 steps" when there is no previous observation or when the
    last step spent at most 0.001 of budget; otherwise reports the remaining
    budget divided by the last step's spend, truncated to an integer.
    """
    unbounded = "budget_runway_at_current_rate: >20 steps"
    if previous_obs is None:
        return unbounded

    remaining = float(obs.budget_remaining)
    spent_last_step = float(previous_obs.budget_remaining) - remaining
    if spent_last_step <= 0.001:
        return unbounded

    steps_left = int(remaining / spent_last_step)
    return f"budget_runway_at_current_rate: ~{steps_left} steps"
80
+
81
+
82
def _previous_step_feedback(obs: Observation) -> str:
    """Render the outcome of the previous step as a small text block.

    Returns an empty string when the observation has no metadata or the
    metadata carries no ``action_type``. Otherwise each
    available detail (reward, success, cost, latency, budget exhaustion) adds
    one line after the header and the previous action.
    """
    meta = getattr(obs, "metadata", None) or {}
    last_action = meta.get("action_type")
    if not last_action:
        return ""

    lines = ["previous_step_feedback:", f" previous_action: {last_action}"]

    if obs.reward is not None:
        lines.append(f" previous_reward: {float(obs.reward):+.2f}")

    succeeded = meta.get("request_succeeded")
    if succeeded is not None:
        lines.append(f" previous_success: {str(bool(succeeded)).lower()}")

    cost = meta.get("cost")
    if cost is not None:
        lines.append(f" previous_cost: {float(cost):.2f}")

    latency_ms = meta.get("latency_ms")
    if latency_ms is not None:
        lines.append(f" previous_latency_ms: {float(latency_ms):.2f}")

    if meta.get("budget_exhausted"):
        lines.append(" previous_budget_exhausted: true")

    return "\n".join(lines)
102
+
103
+
104
def format_observation_for_sft(
    *,
    obs: Observation,
    task_name: str,
    previous_obs: Observation | None,
    previous2_obs: Observation | None,
) -> str:
    """Public observation text used consistently for SFT train/eval.

    Produces one ``key: value`` line per observation field (three decimals),
    followed by the steps remaining, the provider trend, the budget runway
    and, when present, the previous-step feedback block.
    """
    numeric_fields = (
        "provider_a_status",
        "provider_b_status",
        "provider_c_status",
        "budget_remaining",
        "queue_backlog",
        "system_latency",
        "step_count",
    )

    rendered = [f"task: {task_name}"]
    rendered.extend(f"{name}: {getattr(obs, name):.3f}" for name in numeric_fields)
    rendered.append(f"steps_remaining: {_steps_remaining(obs)}")
    rendered.append(_trend_text(obs, previous_obs, previous2_obs))
    rendered.append(_budget_runway_text(obs, previous_obs))

    feedback_block = _previous_step_feedback(obs)
    if feedback_block:
        rendered.append(feedback_block)

    return "\n".join(rendered)
129
+
130
+
131
def run_heuristic_episode(task_cfg: TaskConfig, seed: int) -> dict[str, Any]:
    """Play one full episode with the heuristic baseline and grade it.

    Returns a dict with the overall grader score, the summed step rewards
    (a missing reward counts as 0.0) and the full grader output.
    """
    env = BudgetRouterEnv()
    obs = env.reset(seed=seed, scenario=task_cfg)

    reward_sum = 0.0
    while not obs.done:
        action = heuristic_baseline_policy(obs)
        obs = env.step(action)
        reward_sum += float(obs.reward or 0.0)

    grader = grade_episode(env._internal.history)
    score = float(grader["overall_score"])
    return {"grader_score": score, "total_reward": reward_sum, "grader": grader}
144
+
145
+
146
def _load_ppo_policy(model_path: str) -> Callable[[Observation], str]:
    """Return a callable mapping an observation to a PPO-chosen action name.

    Loaded policies are memoised in ``_PPO_POLICY_CACHE`` keyed by
    ``model_path``. An out-of-range predicted index maps to ``"shed_load"``.

    Raises:
        RuntimeError: if ``stable_baselines3`` cannot be imported.
        FileNotFoundError: if ``model_path`` does not exist.
    """
    cached = _PPO_POLICY_CACHE.get(model_path)
    if cached is not None:
        return cached

    try:
        from stable_baselines3 import PPO
    except ImportError as exc:
        raise RuntimeError(
            "PPO teacher requires training dependencies. Run `uv sync --extra training` "
            "or use --teacher heuristic/llm."
        ) from exc

    checkpoint = Path(model_path)
    if not checkpoint.exists():
        raise FileNotFoundError(f"PPO model not found: {checkpoint}")
    model = PPO.load(str(checkpoint))

    def choose(obs: Observation) -> str:
        prediction, _state = model.predict(_obs_to_array(obs), deterministic=True)
        idx = int(prediction)
        if idx < 0 or idx >= len(PPO_ACTION_NAMES):
            return "shed_load"
        return PPO_ACTION_NAMES[idx]

    _PPO_POLICY_CACHE[model_path] = choose
    return choose
170
+
171
+
172
def _load_llm_policy(task_name: str) -> Callable[[Observation], str]:
    """Build an LLM-backed policy that returns an action name per observation.

    The API key is read from ``HF_TOKEN`` (falling back to ``API_KEY``); the
    endpoint and model come from ``API_BASE_URL`` and ``MODEL_NAME``. The
    router is reset for ``task_name`` before the policy is returned.

    Raises:
        RuntimeError: if neither ``HF_TOKEN`` nor ``API_KEY`` is set.
    """
    credential = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")
    if not credential:
        raise RuntimeError("LLM teacher requires HF_TOKEN or API_KEY in the environment.")

    endpoint = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
    model_id = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
    router = LLMRouter(api_base_url=endpoint, model_name=model_id, api_key=credential)
    router.reset(task_name=task_name)

    def choose(obs: Observation) -> str:
        decision = router.choose_action(obs)
        return decision.action_type.value

    return choose
187
+
188
+
189
def collect_teacher_episode(
    *,
    task_name: str,
    task_cfg: TaskConfig,
    seed: int,
    teacher: str,
    ppo_model_path: str,
) -> dict[str, Any]:
    """Roll out one episode with the chosen teacher and record it as a chat.

    Each step contributes a user message (the formatted observation) and an
    assistant message (the teacher's action name) after the system prompt.
    An action outside ``VALID_ACTIONS`` is replaced by ``"shed_load"``.

    Returns:
        A dict with the seed, teacher name, chat messages, action list,
        grader score, summed reward, full grader output and episode metrics.

    Raises:
        ValueError: if ``teacher`` is not "ppo", "heuristic" or "llm".
    """
    if teacher == "ppo":
        pick_action = _load_ppo_policy(ppo_model_path)
    elif teacher == "llm":
        pick_action = _load_llm_policy(task_name)
    elif teacher == "heuristic":

        def pick_action(obs: Observation) -> str:
            return heuristic_baseline_policy(obs).action_type.value

    else:
        raise ValueError(f"Unknown teacher {teacher!r}")

    env = BudgetRouterEnv()
    obs = env.reset(seed=seed, scenario=task_cfg)

    transcript = [{"role": "system", "content": SYSTEM_PROMPT}]
    chosen_actions: list[str] = []
    reward_sum = 0.0
    # Sliding window of the two most recent observations, used for trends.
    older_obs: Observation | None = None
    last_obs: Observation | None = None

    while not obs.done:
        prompt = format_observation_for_sft(
            obs=obs,
            task_name=task_name,
            previous_obs=last_obs,
            previous2_obs=older_obs,
        )
        action_name = pick_action(obs)
        if action_name not in VALID_ACTIONS:
            action_name = "shed_load"

        transcript.append({"role": "user", "content": prompt})
        transcript.append({"role": "assistant", "content": action_name})
        chosen_actions.append(action_name)

        older_obs, last_obs = last_obs, obs
        obs = env.step(Action(action_type=ActionType(action_name)))
        reward_sum += float(obs.reward or 0.0)

    history = env._internal.history
    grader = grade_episode(history)
    return {
        "seed": seed,
        "teacher": teacher,
        "messages": transcript,
        "actions": chosen_actions,
        "grader_score": float(grader["overall_score"]),
        "total_reward": reward_sum,
        "grader": grader,
        "metrics": episode_metrics(history),
    }
245
+
246
+
247
def select_training_rows(
    episodes: list[dict[str, Any]],
    *,
    top_fraction: float,
    min_keep: int,
    min_delta: float,
) -> list[dict[str, Any]]:
    """Pick the episodes with the largest improvement over the heuristic.

    Episodes are ranked by ``delta_vs_heuristic`` (descending). The quota is
    the larger of ``min_keep`` and ``ceil(len(episodes) * top_fraction)``.
    Only episodes with delta >= ``min_delta`` are eligible, unless fewer than
    ``min_keep`` of them qualify, in which case all ranked episodes are.
    """

    def delta_of(episode: dict[str, Any]) -> float:
        return float(episode["delta_vs_heuristic"])

    by_delta_desc = sorted(episodes, key=delta_of, reverse=True)
    quota = max(min_keep, int(math.ceil(len(by_delta_desc) * top_fraction)))

    qualifying = [episode for episode in by_delta_desc if delta_of(episode) >= min_delta]
    pool = qualifying if len(qualifying) >= min_keep else by_delta_desc
    return pool[: min(quota, len(pool))]
259
+
260
+
261
def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines with sorted keys.

    Missing parent directories are created; an existing file is overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row, sort_keys=True) + "\n" for row in rows)
266
+
267
+
268
def parse_args() -> argparse.Namespace:
    """Parse the command line; most defaults can be overridden by env vars.

    Environment variables consulted for defaults: TEACHER_POLICY, TASK_NAME,
    SFT_START_SEED, SFT_N_EPISODES, SFT_TOP_FRACTION, SFT_MIN_KEEP,
    SFT_MIN_DELTA, PPO_MODEL_PATH, DATASET_REPO and SFT_LOCAL_JSONL.
    """
    env = os.getenv
    parser = argparse.ArgumentParser(description="Generate Budget Router SFT dataset.")
    add = parser.add_argument

    # Which policy labels the data, and on which task preset.
    add("--teacher", choices=["ppo", "heuristic", "llm"], default=env("TEACHER_POLICY", "ppo"))
    add("--task", default=env("TASK_NAME", "hard_multi"), choices=sorted(TASK_PRESETS))

    # Episode range and selection thresholds.
    add("--start-seed", type=int, default=int(env("SFT_START_SEED", "1000")))
    add("--n-episodes", type=int, default=int(env("SFT_N_EPISODES", "100")))
    add("--top-fraction", type=float, default=float(env("SFT_TOP_FRACTION", "0.30")))
    add("--min-keep", type=int, default=int(env("SFT_MIN_KEEP", "20")))
    add("--min-delta", type=float, default=float(env("SFT_MIN_DELTA", "0.0")))

    # Inputs and outputs.
    add("--ppo-model-path", default=env("PPO_MODEL_PATH", DEFAULT_PPO_MODEL_PATH))
    add("--dataset-repo", default=env("DATASET_REPO", DEFAULT_DATASET_REPO))
    add("--local-jsonl", default=env("SFT_LOCAL_JSONL", "outputs/sft_dataset.jsonl"))
    add("--no-push", action="store_true", help="Write local JSONL only; do not push to Hub.")

    return parser.parse_args()
282
+
283
+
284
def main() -> None:
    """Generate teacher episodes, keep the best, write JSONL, optionally push.

    For every seed in ``[start_seed, start_seed + n_episodes)`` a teacher
    episode and a heuristic episode are run on the same task; the teacher
    episode is annotated with its score delta versus the heuristic. The
    selected episodes are written to ``--local-jsonl`` and, unless
    ``--no-push`` is given, pushed to ``--dataset-repo``.

    Preconditions are validated before any episode is generated, so a
    missing token or an empty run fails immediately instead of after the
    (possibly paid) rollouts.

    Raises:
        ValueError: if ``--n-episodes`` is not positive, or if the selection
            settings keep no episode at all.
        RuntimeError: if pushing is requested but ``HF_TOKEN`` is not set.
    """
    args = parse_args()

    if args.n_episodes <= 0:
        raise ValueError(f"--n-episodes must be positive, got {args.n_episodes}")

    # Fail fast: with the LLM teacher every step is a paid API call, so do not
    # discover a missing upload token only after all episodes have been run.
    push_token: str | None = None
    if not args.no_push:
        push_token = os.environ.get("HF_TOKEN")
        if not push_token:
            raise RuntimeError("HF_TOKEN must be set to push the dataset. Use --no-push for local only.")

    task_cfg = TASK_PRESETS[args.task]
    seeds = list(range(args.start_seed, args.start_seed + args.n_episodes))

    if args.teacher == "llm":
        print(
            f"[sft-data] teacher=llm n_episodes={args.n_episodes}; "
            f"expected large-model calls <= {args.n_episodes * task_cfg.max_steps}",
            flush=True,
        )
    else:
        print(f"[sft-data] teacher={args.teacher} uses 0 large-LLM calls", flush=True)

    episodes: list[dict[str, Any]] = []
    for i, seed in enumerate(seeds, start=1):
        teacher_ep = collect_teacher_episode(
            task_name=args.task,
            task_cfg=task_cfg,
            seed=seed,
            teacher=args.teacher,
            ppo_model_path=args.ppo_model_path,
        )
        # Same task and seed for the baseline, so the delta is a paired comparison.
        heuristic_ep = run_heuristic_episode(task_cfg, seed)
        delta = teacher_ep["grader_score"] - heuristic_ep["grader_score"]
        teacher_ep["heuristic_score"] = heuristic_ep["grader_score"]
        teacher_ep["delta_vs_heuristic"] = delta
        episodes.append(teacher_ep)
        print(
            f"[sft-data] {i:03d}/{len(seeds)} seed={seed} "
            f"teacher={teacher_ep['grader_score']:.4f} heuristic={heuristic_ep['grader_score']:.4f} "
            f"delta={delta:+.4f}",
            flush=True,
        )

    kept = select_training_rows(
        episodes,
        top_fraction=args.top_fraction,
        min_keep=args.min_keep,
        min_delta=args.min_delta,
    )
    if not kept:
        # Previously this surfaced as a ZeroDivisionError in the summary below.
        raise ValueError(
            "No episodes were selected for the dataset; "
            "check --top-fraction, --min-keep and --min-delta."
        )

    dataset_rows = [
        {
            "messages": ep["messages"],
            "seed": ep["seed"],
            "teacher": ep["teacher"],
            "teacher_score": ep["grader_score"],
            "heuristic_score": ep["heuristic_score"],
            "delta_vs_heuristic": ep["delta_vs_heuristic"],
            "actions": ep["actions"],
        }
        for ep in kept
    ]
    write_jsonl(Path(args.local_jsonl), dataset_rows)

    mean_all = sum(float(ep["grader_score"]) for ep in episodes) / len(episodes)
    mean_kept = sum(float(ep["grader_score"]) for ep in kept) / len(kept)
    mean_delta = sum(float(ep["delta_vs_heuristic"]) for ep in kept) / len(kept)
    print(
        "[sft-data] summary "
        f"generated={len(episodes)} kept={len(kept)} mean_all={mean_all:.4f} "
        f"mean_kept={mean_kept:.4f} mean_delta_kept={mean_delta:+.4f} "
        f"local_jsonl={args.local_jsonl}",
        flush=True,
    )

    if not args.no_push:
        # Imported lazily so local-only runs do not need `datasets` installed.
        from datasets import Dataset

        Dataset.from_list(dataset_rows).push_to_hub(args.dataset_repo, token=push_token)
        print(f"[sft-data] pushed dataset to https://huggingface.co/datasets/{args.dataset_repo}", flush=True)
358
+
359
+
360
+ if __name__ == "__main__":
361
+ main()
gradio_ui/__init__.py ADDED
File without changes
gradio_ui/config.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
# NOTE(review): presumably mirrors the environment's episode length — confirm.
MAX_STEPS = 20

SCENARIOS = ["easy", "medium", "hard", "hard_multi"]

# (label, value) pairs for the policy selector.
POLICY_CHOICES = [("Heuristic", "heuristic"), ("LLM", "llm")]

# The PPO option is offered only when stable_baselines3 imports cleanly. The
# catch stays broad: any import-time failure simply leaves the option out.
try:
    import stable_baselines3  # type: ignore # noqa: F401
except Exception:
    pass
else:
    POLICY_CHOICES.append(("PPO (hard_multi)", "ppo"))

PPO_MODEL_PATH = "trained_models/ppo_hard_multi_100k.zip"
gradio_ui/legacy_api.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, Optional, Tuple
4
+
5
+ import requests
6
+
7
BASE_URL = "http://localhost:8000"
AUTO_PLAY_DELAY = 0.5


class APIClient:
    """Single-responsibility HTTP client for the OpenEnv Budget Router API.

    Network and decoding failures are never raised to the caller: every
    request helper returns a ``(result, error)`` pair in which ``error`` is
    ``None`` on success and a non-empty message on failure.
    """

    def __init__(self, base_url: str = BASE_URL) -> None:
        # Strip trailing slashes so paths such as "/reset" can be appended directly.
        self.base_url = base_url.rstrip("/")

    @staticmethod
    def _describe(exc: Exception) -> str:
        """Return a non-empty description of ``exc``.

        ``str(exc)`` is empty for exceptions raised without a message; an
        empty error string would be indistinguishable from success for
        callers that test ``if err:``.
        """
        return str(exc) or type(exc).__name__

    @staticmethod
    def _payload_error(data: Optional[Dict], err: Optional[str]) -> Optional[str]:
        """Return why a response cannot be normalised, or None if it can."""
        if err is not None:
            return err or "request failed"
        if not isinstance(data, dict):
            return f"unexpected response payload: {type(data).__name__}"
        return None

    def _post(self, path: str, body: Dict) -> Tuple[Optional[Dict], Optional[str]]:
        """POST ``body`` as JSON to ``path``; return ``(json, None)`` or ``(None, error)``."""
        try:
            r = requests.post(f"{self.base_url}{path}", json=body, timeout=15)
            r.raise_for_status()
            return r.json(), None
        except Exception as exc:
            # Boundary handler: the UI shows the message instead of crashing.
            return None, self._describe(exc)

    def _get(self, path: str) -> Tuple[Optional[Dict], Optional[str]]:
        """GET ``path``; return ``(json, None)`` or ``(None, error)``."""
        try:
            r = requests.get(f"{self.base_url}{path}", timeout=10)
            r.raise_for_status()
            return r.json(), None
        except Exception as exc:
            # Boundary handler: the UI shows the message instead of crashing.
            return None, self._describe(exc)

    @staticmethod
    def _normalize(payload: Dict) -> Tuple[Dict, float, Dict, bool]:
        """Handle both flat and observation-wrapped response shapes.

        Returns ``(obs, reward, meta, done)``. Top-level ``reward``,
        ``metadata`` and ``done`` take precedence over the same keys inside
        the observation; a missing or null reward becomes 0.0 and missing or
        null metadata becomes ``{}``.
        """
        obs = payload.get("observation", payload)
        reward = float(payload.get("reward", obs.get("reward", 0.0)) or 0.0)
        meta = payload.get("metadata", obs.get("metadata", {})) or {}
        done = bool(payload.get("done", obs.get("done", False)))
        return obs, reward, meta, done

    def reset(self, seed: int, scenario: str):
        """Start a new episode; return ``(observation, None)`` or ``(None, error)``."""
        data, err = self._post("/reset", {"seed": seed, "scenario": scenario})
        problem = self._payload_error(data, err)
        if problem is not None:
            return None, problem
        obs, _, _, _ = self._normalize(data)
        return obs, None

    def step(self, action_type: str):
        """Apply one action; return ``((obs, reward, meta, done), None)`` or ``(None, error)``."""
        data, err = self._post("/step", {"action_type": action_type})
        problem = self._payload_error(data, err)
        if problem is not None:
            return None, problem
        return self._normalize(data), None

    def state(self):
        """Fetch ``/state``; return ``(json, None)`` or ``(None, error)``."""
        return self._get("/state")