Cyber-Machine committed on
Commit
b522b5c
·
verified ·
1 Parent(s): fecc757

feat: implement grading system with task definitions and score extraction

Browse files
grade/common.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
# Graders report scores inside an open band so 0.0 and 1.0 are never emitted.
MIN_SCORE = 0.01
MAX_SCORE = 0.99
# Matches trailer lines such as "[END] ... score=0.42".
END_SCORE_RE = re.compile(r"\[END\].*?\bscore=([0-9]+(?:\.[0-9]+)?)")
# Matches header lines such as "[START] task=easy".
START_TASK_RE = re.compile(r"\[START\]\s+task=([^\s]+)")


def clamp_score(score: float) -> float:
    """Clip *score* into [MIN_SCORE, MAX_SCORE] and round to 4 decimal places."""
    if score < MIN_SCORE:
        score = MIN_SCORE
    elif score > MAX_SCORE:
        score = MAX_SCORE
    return round(score, 4)
17
+
18
+
19
def read_payload_text() -> str:
    """Return the grading payload text.

    If the first CLI argument names an existing regular file, its contents
    are returned; otherwise the payload is read from stdin.
    """
    if len(sys.argv) > 1:
        path = Path(sys.argv[1])
        # is_file() (not exists()) so a directory argument falls through to
        # stdin instead of raising IsADirectoryError from read_text().
        if path.is_file():
            return path.read_text()
    return sys.stdin.read()
25
+
26
+
27
+ def _lookup_score(value: Any) -> float | None:
28
+ if isinstance(value, (int, float)):
29
+ return float(value)
30
+
31
+ if isinstance(value, dict):
32
+ for key in (
33
+ "score",
34
+ "benchmark_score",
35
+ "final_score",
36
+ "task_score",
37
+ ):
38
+ candidate = value.get(key)
39
+ if isinstance(candidate, (int, float)):
40
+ return float(candidate)
41
+
42
+ for key in (
43
+ "success_metrics",
44
+ "observation",
45
+ "final_observation",
46
+ "result",
47
+ "metrics",
48
+ ):
49
+ candidate = value.get(key)
50
+ if candidate is not None:
51
+ nested = _lookup_score(candidate)
52
+ if nested is not None:
53
+ return nested
54
+
55
+ if isinstance(value, list):
56
+ for item in value:
57
+ nested = _lookup_score(item)
58
+ if nested is not None:
59
+ return nested
60
+
61
+ return None
62
+
63
+
64
def extract_score(text: str) -> float:
    """Parse a grading score out of *text*.

    Tries, in order: an "[END] ... score=X" trailer line, then a JSON
    payload searched recursively for score fields.  Empty, unparseable,
    or score-free input collapses to MIN_SCORE.
    """
    body = text.strip()
    if not body:
        return MIN_SCORE

    trailer = END_SCORE_RE.search(body)
    if trailer is not None:
        return clamp_score(float(trailer.group(1)))

    try:
        payload = json.loads(body)
    except json.JSONDecodeError:
        return MIN_SCORE

    found = _lookup_score(payload)
    return MIN_SCORE if found is None else clamp_score(found)
82
+
83
+
84
def extract_started_task(text: str) -> str | None:
    """Return the task name from a "[START] task=<name>" line, or None."""
    found = START_TASK_RE.search(text)
    return found.group(1) if found is not None else None
89
+
90
+
91
def emit_grade(expected_task: str) -> int:
    """Read the payload, grade it, and print a one-line JSON verdict.

    The score collapses to MIN_SCORE when the payload's "[START]" line
    names a task other than *expected_task*.  Always returns exit code 0.
    """
    payload_text = read_payload_text()
    started = extract_started_task(payload_text)
    score = extract_score(payload_text)
    # A mismatched task id means the transcript graded the wrong task.
    if started is not None and started != expected_task:
        score = MIN_SCORE
    verdict = {
        "task_id": expected_task,
        "score": score,
    }
    print(json.dumps(verdict, separators=(",", ":")))
    return 0
grade/task_easy ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/usr/bin/env python3
# Grader entry point for the "easy" task: reads the payload (argv path or
# stdin), prints a JSON verdict, and exits with emit_grade's status (0).
from common import emit_grade

raise SystemExit(emit_grade("easy"))
grade/task_hard ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/usr/bin/env python3
# Grader entry point for the "hard" task: reads the payload (argv path or
# stdin), prints a JSON verdict, and exits with emit_grade's status (0).
from common import emit_grade

raise SystemExit(emit_grade("hard"))
grade/task_medium ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/usr/bin/env python3
# Grader entry point for the "medium" task: reads the payload (argv path or
# stdin), prints a JSON verdict, and exits with emit_grade's status (0).
from common import emit_grade

raise SystemExit(emit_grade("medium"))
inference.py CHANGED
@@ -26,6 +26,7 @@ PRESETS = [
26
  PROJECT_DIR = Path(__file__).resolve().parent
27
  IMAGE_NAME = "workflow-arena-inference:latest"
28
  DOCKERFILE_PATH = PROJECT_DIR / "server" / "Dockerfile"
 
29
  TEMPERATURE = 0.0
30
  MAX_STEPS = 256
31
 
@@ -65,6 +66,10 @@ def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> No
65
  )
66
 
67
 
 
 
 
 
68
  def compact_task(task: WorkflowTaskView) -> dict[str, object]:
69
  return {
70
  "task_id": task.task_id,
@@ -172,6 +177,40 @@ def action_to_log_string(action: WorkflowArenaAction) -> str:
172
  return json.dumps(payload, separators=(",", ":"))
173
 
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  def compute_score(observation: WorkflowArenaObservation) -> float:
176
  score = observation.benchmark_score
177
  if score is None:
@@ -233,12 +272,25 @@ def ensure_local_image() -> None:
233
 
234
  @asynccontextmanager
235
  async def managed_env():
 
 
 
 
 
 
 
 
 
 
236
  ensure_local_image()
237
  env = await WorkflowArenaEnv.from_docker_image(IMAGE_NAME)
238
  try:
239
  yield env
240
  finally:
241
- await env.close()
 
 
 
242
 
243
 
244
  async def run_episode(
@@ -255,10 +307,18 @@ async def run_episode(
255
 
256
  log_start(task=preset.value, env=BENCHMARK, model=model_name)
257
 
258
- result = await env.reset(
259
- seed=seed,
260
- preset=preset.value,
261
- )
 
 
 
 
 
 
 
 
262
  observation = result.observation
263
 
264
  while not observation.done and steps_taken < MAX_STEPS:
@@ -274,11 +334,21 @@ async def run_episode(
274
 
275
  try:
276
  result = await env.step(action)
277
- except (
278
- Exception
279
- ): # pragma: no cover - preserve log format and continue safely
280
- action = heuristic_action(observation)
281
- result = await env.step(action)
 
 
 
 
 
 
 
 
 
 
282
 
283
  observation = result.observation
284
  reward = float(result.reward or 0.0)
@@ -302,13 +372,7 @@ async def run_episode(
302
 
303
 
304
  async def main() -> None:
305
- api_base_url = os.environ["API_BASE_URL"]
306
- model_name = os.environ["MODEL_NAME"]
307
- api_key = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
308
- if not api_key:
309
- raise RuntimeError("HF_TOKEN or OPENAI_API_KEY must be set.")
310
-
311
- client = OpenAI(base_url=api_base_url, api_key=api_key)
312
 
313
  async with managed_env() as env:
314
  for index, preset in enumerate(PRESETS):
@@ -322,4 +386,7 @@ async def main() -> None:
322
 
323
 
324
  if __name__ == "__main__":
325
- asyncio.run(main())
 
 
 
 
26
  PROJECT_DIR = Path(__file__).resolve().parent
27
  IMAGE_NAME = "workflow-arena-inference:latest"
28
  DOCKERFILE_PATH = PROJECT_DIR / "server" / "Dockerfile"
29
+ DEFAULT_BASE_URL = os.getenv("WORKFLOW_ARENA_BASE_URL", "http://localhost:8000")
30
  TEMPERATURE = 0.0
31
  MAX_STEPS = 256
32
 
 
66
  )
67
 
68
 
69
def log_warning(message: str) -> None:
    """Print a [WARN]-prefixed line, flushed so it interleaves with env output."""
    print(f"[WARN] {message}", flush=True)
71
+
72
+
73
  def compact_task(task: WorkflowTaskView) -> dict[str, object]:
74
  return {
75
  "task_id": task.task_id,
 
177
  return json.dumps(payload, separators=(",", ":"))
178
 
179
 
180
def resolve_model_client() -> tuple[OpenAI | None, str]:
    """Build an OpenAI client from environment configuration.

    Returns ``(client, model_name)`` when API_BASE_URL, MODEL_NAME and an
    API key are all present.  If anything is missing, or client creation
    fails, a warning is logged and ``(None, "heuristic")`` is returned so
    the caller can fall back to the heuristic policy.
    """
    api_base_url = os.getenv("API_BASE_URL")
    model_name = os.getenv("MODEL_NAME")
    # Accept any of the supported key variables, in priority order.
    api_key = (
        os.getenv("API_KEY")
        or os.getenv("HF_TOKEN")
        or os.getenv("OPENAI_API_KEY")
    )

    required = {
        "API_BASE_URL": api_base_url,
        "MODEL_NAME": model_name,
        "API_KEY": api_key,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        log_warning(
            "Missing model configuration ("
            + ", ".join(missing)
            + "). Falling back to heuristic policy."
        )
        return None, "heuristic"

    try:
        return OpenAI(base_url=api_base_url, api_key=api_key), model_name
    except Exception as exc:  # pragma: no cover - defensive initialization fallback
        log_warning(
            f"Failed to initialize model client: {exc}. Falling back to heuristic policy."
        )
        return None, "heuristic"
212
+
213
+
214
  def compute_score(observation: WorkflowArenaObservation) -> float:
215
  score = observation.benchmark_score
216
  if score is None:
 
272
 
273
@asynccontextmanager
async def managed_env():
    """Yield a WorkflowArenaEnv, preferring the remote server over Docker.

    First tries the HTTP endpoint at DEFAULT_BASE_URL; if *connecting*
    fails, falls back to building/running the local Docker image.
    Teardown errors are logged but never propagated.

    Only connection setup sits inside the try: the original wrapped the
    ``yield`` itself, so an exception raised in the caller's body was
    swallowed and triggered a second yield via the Docker fallback —
    which asynccontextmanager rejects with
    "generator didn't stop after athrow()".
    """
    from contextlib import AsyncExitStack  # stdlib; local to avoid touching module imports

    stack = AsyncExitStack()
    try:
        env = await stack.enter_async_context(
            WorkflowArenaEnv(base_url=DEFAULT_BASE_URL)
        )
    except Exception as exc:
        await stack.aclose()
        log_warning(
            f"Failed to connect to environment at {DEFAULT_BASE_URL}: {exc}. "
            "Trying local Docker fallback."
        )
    else:
        # Body exceptions now propagate through the stack's __aexit__.
        async with stack:
            yield env
        return

    ensure_local_image()
    env = await WorkflowArenaEnv.from_docker_image(IMAGE_NAME)
    try:
        yield env
    finally:
        try:
            await env.close()
        except Exception as exc:  # pragma: no cover - teardown failures should not fail inference
            log_warning(f"Failed to close Docker environment cleanly: {exc}")
294
 
295
 
296
  async def run_episode(
 
307
 
308
  log_start(task=preset.value, env=BENCHMARK, model=model_name)
309
 
310
+ try:
311
+ result = await env.reset(
312
+ seed=seed,
313
+ preset=preset.value,
314
+ )
315
+ except Exception as exc: # pragma: no cover - env availability failures are external
316
+ log_warning(f"Failed to reset preset={preset.value}: {exc}")
317
+ log_end(success=False, steps=steps_taken, score=score, rewards=rewards)
318
+ return EpisodeResult(
319
+ success=success, steps=steps_taken, score=score, rewards=rewards
320
+ )
321
+
322
  observation = result.observation
323
 
324
  while not observation.done and steps_taken < MAX_STEPS:
 
334
 
335
  try:
336
  result = await env.step(action)
337
+ except Exception as exc: # pragma: no cover - preserve log format and continue safely
338
+ fallback_action = heuristic_action(observation)
339
+ if fallback_action != action:
340
+ log_warning(
341
+ f"Step failed for preset={preset.value} with model action: {exc}. "
342
+ "Retrying with heuristic action."
343
+ )
344
+ action = fallback_action
345
+ try:
346
+ result = await env.step(action)
347
+ except Exception as retry_exc:
348
+ log_warning(
349
+ f"Step failed for preset={preset.value} even with heuristic action: {retry_exc}"
350
+ )
351
+ break
352
 
353
  observation = result.observation
354
  reward = float(result.reward or 0.0)
 
372
 
373
 
374
  async def main() -> None:
375
+ client, model_name = resolve_model_client()
 
 
 
 
 
 
376
 
377
  async with managed_env() as env:
378
  for index, preset in enumerate(PRESETS):
 
386
 
387
 
388
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except Exception as exc:  # pragma: no cover - final safeguard for validator stability
        # Swallow any fatal error so the grading harness still sees a clean exit.
        log_warning(f"Fatal inference error: {exc}")
openenv.yaml CHANGED
@@ -4,4 +4,13 @@ type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
-
 
 
 
 
 
 
 
 
 
 
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
+ tasks:
8
+ - id: task_easy
9
+ description: Schedule a small, low-pressure dependency graph with high worker utilization.
10
+ grader: grade/task_easy
11
+ - id: task_medium
12
+ description: Balance utilization and deadlines on a moderately constrained workflow.
13
+ grader: grade/task_medium
14
+ - id: task_hard
15
+ description: Schedule under outages and retries while protecting critical work.
16
+ grader: grade/task_hard
pyproject.toml CHANGED
@@ -19,6 +19,7 @@ dependencies = [
19
  # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
  "openenv-core[core]>=0.2.2",
21
  "gradio>=5.0.0",
 
22
  "plotly>=5.24.0",
23
  # Environment-specific dependencies
24
  # Add all dependencies needed for your environment here
 
19
  # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
  "openenv-core[core]>=0.2.2",
21
  "gradio>=5.0.0",
22
+ "openai>=1.0.0",
23
  "plotly>=5.24.0",
24
  # Environment-specific dependencies
25
  # Add all dependencies needed for your environment here
server/workflow_arena_environment.py CHANGED
@@ -49,6 +49,8 @@ class WorkflowArenaEnvironment(Environment):
49
  UNFINISHED_PRIORITY_PENALTY: float = -0.02
50
  OVERDUE_PRIORITY_PENALTY_PER_TICK: float = -0.005
51
  MAX_RECENT_FAILURE_EVENTS: int = 6
 
 
52
 
53
  def __init__(self):
54
  self._state = State(episode_id=str(uuid4()), step_count=0)
@@ -100,13 +102,18 @@ class WorkflowArenaEnvironment(Environment):
100
  score = lower_bound / max(lower_bound, env_state.current_time)
101
  return round(score, 4)
102
 
 
 
 
 
 
 
103
  def _benchmark_score(self) -> float:
104
  makespan_score, deadline_score, utilization_score = self._grade_components(
105
  include_terminal_makespan=True
106
  )
107
- return round(
108
- (0.5 * makespan_score) + (0.3 * deadline_score) + (0.2 * utilization_score),
109
- 4,
110
  )
111
 
112
  def _grade_components(
 
49
  UNFINISHED_PRIORITY_PENALTY: float = -0.02
50
  OVERDUE_PRIORITY_PENALTY_PER_TICK: float = -0.005
51
  MAX_RECENT_FAILURE_EVENTS: int = 6
52
+ MIN_GRADER_SCORE: float = 0.01
53
+ MAX_GRADER_SCORE: float = 0.99
54
 
55
  def __init__(self):
56
  self._state = State(episode_id=str(uuid4()), step_count=0)
 
102
  score = lower_bound / max(lower_bound, env_state.current_time)
103
  return round(score, 4)
104
 
105
+ def _bounded_grader_score(self, score: float) -> float:
106
+ return round(
107
+ min(self.MAX_GRADER_SCORE, max(self.MIN_GRADER_SCORE, score)),
108
+ 4,
109
+ )
110
+
111
  def _benchmark_score(self) -> float:
112
  makespan_score, deadline_score, utilization_score = self._grade_components(
113
  include_terminal_makespan=True
114
  )
115
+ return self._bounded_grader_score(
116
+ (0.5 * makespan_score) + (0.3 * deadline_score) + (0.2 * utilization_score)
 
117
  )
118
 
119
  def _grade_components(
uv.lock CHANGED
@@ -1613,6 +1613,7 @@ version = "0.1.0"
1613
  source = { editable = "." }
1614
  dependencies = [
1615
  { name = "gradio" },
 
1616
  { name = "openenv-core", extra = ["core"] },
1617
  { name = "plotly" },
1618
  ]
@@ -1626,6 +1627,7 @@ dev = [
1626
  [package.metadata]
1627
  requires-dist = [
1628
  { name = "gradio", specifier = ">=5.0.0" },
 
1629
  { name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" },
1630
  { name = "plotly", specifier = ">=5.24.0" },
1631
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
 
1613
  source = { editable = "." }
1614
  dependencies = [
1615
  { name = "gradio" },
1616
+ { name = "openai" },
1617
  { name = "openenv-core", extra = ["core"] },
1618
  { name = "plotly" },
1619
  ]
 
1627
  [package.metadata]
1628
  requires-dist = [
1629
  { name = "gradio", specifier = ">=5.0.0" },
1630
+ { name = "openai", specifier = ">=1.0.0" },
1631
  { name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" },
1632
  { name = "plotly", specifier = ">=5.24.0" },
1633
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },