Cyber-Machine committed on
Commit
b522b5c
·
verified ·
1 Parent(s): fecc757

feat: implement grading system with task definitions and score extraction

Browse files
grade/common.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
# Graders report scores inside an open band so 0.0 and 1.0 are never emitted.
MIN_SCORE = 0.01
MAX_SCORE = 0.99
# Matches trailer lines such as "[END] ... score=0.42".
END_SCORE_RE = re.compile(r"\[END\].*?\bscore=([0-9]+(?:\.[0-9]+)?)")
# Matches header lines such as "[START] task=easy".
START_TASK_RE = re.compile(r"\[START\]\s+task=([^\s]+)")


def clamp_score(score: float) -> float:
    """Clip *score* into [MIN_SCORE, MAX_SCORE] and round to 4 decimal places."""
    if score < MIN_SCORE:
        score = MIN_SCORE
    elif score > MAX_SCORE:
        score = MAX_SCORE
    return round(score, 4)
17
+
18
+
19
def read_payload_text() -> str:
    """Return the grading payload text.

    If the first CLI argument names an existing regular file, its contents
    are returned; otherwise the payload is read from stdin.
    """
    if len(sys.argv) > 1:
        path = Path(sys.argv[1])
        # is_file() (not exists()) so a directory argument falls through to
        # stdin instead of raising IsADirectoryError from read_text().
        if path.is_file():
            return path.read_text()
    return sys.stdin.read()
25
+
26
+
27
+ def _lookup_score(value: Any) -> float | None:
28
+ if isinstance(value, (int, float)):
29
+ return float(value)
30
+
31
+ if isinstance(value, dict):
32
+ for key in (
33
+ "score",
34
+ "benchmark_score",
35
+ "final_score",
36
+ "task_score",
37
+ ):
38
+ candidate = value.get(key)
39
+ if isinstance(candidate, (int, float)):
40
+ return float(candidate)
41
+
42
+ for key in (
43
+ "success_metrics",
44
+ "observation",
45
+ "final_observation",
46
+ "result",
47
+ "metrics",
48
+ ):
49
+ candidate = value.get(key)
50
+ if candidate is not None:
51
+ nested = _lookup_score(candidate)
52
+ if nested is not None:
53
+ return nested
54
+
55
+ if isinstance(value, list):
56
+ for item in value:
57
+ nested = _lookup_score(item)
58
+ if nested is not None:
59
+ return nested
60
+
61
+ return None
62
+
63
+
64
def extract_score(text: str) -> float:
    """Parse a grading score out of *text*.

    Tries, in order: an "[END] ... score=X" trailer line, then a JSON
    payload searched recursively for score fields.  Empty, unparseable,
    or score-free input collapses to MIN_SCORE.
    """
    body = text.strip()
    if not body:
        return MIN_SCORE

    trailer = END_SCORE_RE.search(body)
    if trailer is not None:
        return clamp_score(float(trailer.group(1)))

    try:
        payload = json.loads(body)
    except json.JSONDecodeError:
        return MIN_SCORE

    found = _lookup_score(payload)
    return MIN_SCORE if found is None else clamp_score(found)
82
+
83
+
84
def extract_started_task(text: str) -> str | None:
    """Return the task name from a "[START] task=<name>" line, or None."""
    found = START_TASK_RE.search(text)
    return found.group(1) if found is not None else None
89
+
90
+
91
def emit_grade(expected_task: str) -> int:
    """Read the payload, grade it, and print a one-line JSON verdict.

    The score collapses to MIN_SCORE when the payload's "[START]" line
    names a task other than *expected_task*.  Always returns exit code 0.
    """
    payload_text = read_payload_text()
    started = extract_started_task(payload_text)
    score = extract_score(payload_text)
    # A mismatched task id means the transcript graded the wrong task.
    if started is not None and started != expected_task:
        score = MIN_SCORE
    verdict = {
        "task_id": expected_task,
        "score": score,
    }
    print(json.dumps(verdict, separators=(",", ":")))
    return 0
grade/task_easy ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/usr/bin/env python3
# Grader entry point for the "easy" task: reads the payload (argv path or
# stdin), prints a JSON verdict, and exits with emit_grade's status (0).
from common import emit_grade

raise SystemExit(emit_grade("easy"))
grade/task_hard ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/usr/bin/env python3
# Grader entry point for the "hard" task: reads the payload (argv path or
# stdin), prints a JSON verdict, and exits with emit_grade's status (0).
from common import emit_grade

raise SystemExit(emit_grade("hard"))
grade/task_medium ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/usr/bin/env python3
# Grader entry point for the "medium" task: reads the payload (argv path or
# stdin), prints a JSON verdict, and exits with emit_grade's status (0).
from common import emit_grade

raise SystemExit(emit_grade("medium"))
inference.py CHANGED
@@ -26,6 +26,7 @@ PRESETS = [
26
  PROJECT_DIR = Path(__file__).resolve().parent
27
  IMAGE_NAME = "workflow-arena-inference:latest"
28
  DOCKERFILE_PATH = PROJECT_DIR / "server" / "Dockerfile"
 
29
  TEMPERATURE = 0.0
30
  MAX_STEPS = 256
31
 
@@ -65,6 +66,10 @@ def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> No
65
  )
66
 
67
 
 
 
 
 
68
  def compact_task(task: WorkflowTaskView) -> dict[str, object]:
69
  return {
70
  "task_id": task.task_id,
@@ -172,6 +177,40 @@ def action_to_log_string(action: WorkflowArenaAction) -> str:
172
  return json.dumps(payload, separators=(",", ":"))
173
 
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  def compute_score(observation: WorkflowArenaObservation) -> float:
176
  score = observation.benchmark_score
177
  if score is None:
@@ -233,12 +272,25 @@ def ensure_local_image() -> None:
233
 
234
  @asynccontextmanager
235
  async def managed_env():
 
 
 
 
 
 
 
 
 
 
236
  ensure_local_image()
237
  env = await WorkflowArenaEnv.from_docker_image(IMAGE_NAME)
238
  try:
239
  yield env
240
  finally:
241
- await env.close()
 
 
 
242
 
243
 
244
  async def run_episode(
@@ -255,10 +307,18 @@ async def run_episode(
255
 
256
  log_start(task=preset.value, env=BENCHMARK, model=model_name)
257
 
258
- result = await env.reset(
259
- seed=seed,
260
- preset=preset.value,
261
- )
 
 
 
 
 
 
 
 
262
  observation = result.observation
263
 
264
  while not observation.done and steps_taken < MAX_STEPS:
@@ -274,11 +334,21 @@ async def run_episode(
274
 
275
  try:
276
  result = await env.step(action)
277
- except (
278
- Exception
279
- ): # pragma: no cover - preserve log format and continue safely
280
- action = heuristic_action(observation)
281
- result = await env.step(action)
 
 
 
 
 
 
 
 
 
 
282
 
283
  observation = result.observation
284
  reward = float(result.reward or 0.0)
@@ -302,13 +372,7 @@ async def run_episode(
302
 
303
 
304
  async def main() -> None:
305
- api_base_url = os.environ["API_BASE_URL"]
306
- model_name = os.environ["MODEL_NAME"]
307
- api_key = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
308
- if not api_key:
309
- raise RuntimeError("HF_TOKEN or OPENAI_API_KEY must be set.")
310
-
311
- client = OpenAI(base_url=api_base_url, api_key=api_key)
312
 
313
  async with managed_env() as env:
314
  for index, preset in enumerate(PRESETS):
@@ -322,4 +386,7 @@ async def main() -> None:
322
 
323
 
324
  if __name__ == "__main__":
325
- asyncio.run(main())
 
 
 
 
26
  PROJECT_DIR = Path(__file__).resolve().parent
27
  IMAGE_NAME = "workflow-arena-inference:latest"
28
  DOCKERFILE_PATH = PROJECT_DIR / "server" / "Dockerfile"
29
+ DEFAULT_BASE_URL = os.getenv("WORKFLOW_ARENA_BASE_URL", "http://localhost:8000")
30
  TEMPERATURE = 0.0
31
  MAX_STEPS = 256
32
 
 
66
  )
67
 
68
 
69
def log_warning(message: str) -> None:
    """Print a [WARN]-prefixed line, flushed so it interleaves with env output."""
    print(f"[WARN] {message}", flush=True)
71
+
72
+
73
  def compact_task(task: WorkflowTaskView) -> dict[str, object]:
74
  return {
75
  "task_id": task.task_id,
 
177
  return json.dumps(payload, separators=(",", ":"))
178
 
179
 
180
def resolve_model_client() -> tuple[OpenAI | None, str]:
    """Build an OpenAI client from environment configuration.

    Returns ``(client, model_name)`` when API_BASE_URL, MODEL_NAME and an
    API key are all present.  If anything is missing, or client creation
    fails, a warning is logged and ``(None, "heuristic")`` is returned so
    the caller can fall back to the heuristic policy.
    """
    api_base_url = os.getenv("API_BASE_URL")
    model_name = os.getenv("MODEL_NAME")
    # Accept any of the supported key variables, in priority order.
    api_key = (
        os.getenv("API_KEY")
        or os.getenv("HF_TOKEN")
        or os.getenv("OPENAI_API_KEY")
    )

    required = {
        "API_BASE_URL": api_base_url,
        "MODEL_NAME": model_name,
        "API_KEY": api_key,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        log_warning(
            "Missing model configuration ("
            + ", ".join(missing)
            + "). Falling back to heuristic policy."
        )
        return None, "heuristic"

    try:
        return OpenAI(base_url=api_base_url, api_key=api_key), model_name
    except Exception as exc:  # pragma: no cover - defensive initialization fallback
        log_warning(
            f"Failed to initialize model client: {exc}. Falling back to heuristic policy."
        )
        return None, "heuristic"
212
+
213
+
214
  def compute_score(observation: WorkflowArenaObservation) -> float:
215
  score = observation.benchmark_score
216
  if score is None:
 
272
 
273
@asynccontextmanager
async def managed_env():
    """Yield a WorkflowArenaEnv, preferring the remote server over Docker.

    First tries the HTTP endpoint at DEFAULT_BASE_URL; if *connecting*
    fails, falls back to building/running the local Docker image.
    Teardown errors are logged but never propagated.

    Only connection setup sits inside the try: the original wrapped the
    ``yield`` itself, so an exception raised in the caller's body was
    swallowed and triggered a second yield via the Docker fallback —
    which asynccontextmanager rejects with
    "generator didn't stop after athrow()".
    """
    from contextlib import AsyncExitStack  # stdlib; local to avoid touching module imports

    stack = AsyncExitStack()
    try:
        env = await stack.enter_async_context(
            WorkflowArenaEnv(base_url=DEFAULT_BASE_URL)
        )
    except Exception as exc:
        await stack.aclose()
        log_warning(
            f"Failed to connect to environment at {DEFAULT_BASE_URL}: {exc}. "
            "Trying local Docker fallback."
        )
    else:
        # Body exceptions now propagate through the stack's __aexit__.
        async with stack:
            yield env
        return

    ensure_local_image()
    env = await WorkflowArenaEnv.from_docker_image(IMAGE_NAME)
    try:
        yield env
    finally:
        try:
            await env.close()
        except Exception as exc:  # pragma: no cover - teardown failures should not fail inference
            log_warning(f"Failed to close Docker environment cleanly: {exc}")
294
 
295
 
296
  async def run_episode(
 
307
 
308
  log_start(task=preset.value, env=BENCHMARK, model=model_name)
309
 
310
+ try:
311
+ result = await env.reset(
312
+ seed=seed,
313
+ preset=preset.value,
314
+ )
315
+ except Exception as exc: # pragma: no cover - env availability failures are external
316
+ log_warning(f"Failed to reset preset={preset.value}: {exc}")
317
+ log_end(success=False, steps=steps_taken, score=score, rewards=rewards)
318
+ return EpisodeResult(
319
+ success=success, steps=steps_taken, score=score, rewards=rewards
320
+ )
321
+
322
  observation = result.observation
323
 
324
  while not observation.done and steps_taken < MAX_STEPS:
 
334
 
335
  try:
336
  result = await env.step(action)
337
+ except Exception as exc: # pragma: no cover - preserve log format and continue safely
338
+ fallback_action = heuristic_action(observation)
339
+ if fallback_action != action:
340
+ log_warning(
341
+ f"Step failed for preset={preset.value} with model action: {exc}. "
342
+ "Retrying with heuristic action."
343
+ )
344
+ action = fallback_action
345
+ try:
346
+ result = await env.step(action)
347
+ except Exception as retry_exc:
348
+ log_warning(
349
+ f"Step failed for preset={preset.value} even with heuristic action: {retry_exc}"
350
+ )
351
+ break
352
 
353
  observation = result.observation
354
  reward = float(result.reward or 0.0)
 
372
 
373
 
374
  async def main() -> None:
375
+ client, model_name = resolve_model_client()
 
 
 
 
 
 
376
 
377
  async with managed_env() as env:
378
  for index, preset in enumerate(PRESETS):
 
386
 
387
 
388
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except Exception as exc:  # pragma: no cover - final safeguard for validator stability
        # Swallow any fatal error so the grading harness still sees a clean exit.
        log_warning(f"Fatal inference error: {exc}")
openenv.yaml CHANGED
@@ -4,4 +4,13 @@ type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
-
 
 
 
 
 
 
 
 
 
 
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
7
+ tasks:
8
+ - id: task_easy
9
+ description: Schedule a small, low-pressure dependency graph with high worker utilization.
10
+ grader: grade/task_easy
11
+ - id: task_medium
12
+ description: Balance utilization and deadlines on a moderately constrained workflow.
13
+ grader: grade/task_medium
14
+ - id: task_hard
15
+ description: Schedule under outages and retries while protecting critical work.
16
+ grader: grade/task_hard
pyproject.toml CHANGED
@@ -19,6 +19,7 @@ dependencies = [
19
  # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
  "openenv-core[core]>=0.2.2",
21
  "gradio>=5.0.0",
 
22
  "plotly>=5.24.0",
23
  # Environment-specific dependencies
24
  # Add all dependencies needed for your environment here
 
19
  # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
  "openenv-core[core]>=0.2.2",
21
  "gradio>=5.0.0",
22
+ "openai>=1.0.0",
23
  "plotly>=5.24.0",
24
  # Environment-specific dependencies
25
  # Add all dependencies needed for your environment here
server/workflow_arena_environment.py CHANGED
@@ -49,6 +49,8 @@ class WorkflowArenaEnvironment(Environment):
49
  UNFINISHED_PRIORITY_PENALTY: float = -0.02
50
  OVERDUE_PRIORITY_PENALTY_PER_TICK: float = -0.005
51
  MAX_RECENT_FAILURE_EVENTS: int = 6
 
 
52
 
53
  def __init__(self):
54
  self._state = State(episode_id=str(uuid4()), step_count=0)
@@ -100,13 +102,18 @@ class WorkflowArenaEnvironment(Environment):
100
  score = lower_bound / max(lower_bound, env_state.current_time)
101
  return round(score, 4)
102
 
 
 
 
 
 
 
103
  def _benchmark_score(self) -> float:
104
  makespan_score, deadline_score, utilization_score = self._grade_components(
105
  include_terminal_makespan=True
106
  )
107
- return round(
108
- (0.5 * makespan_score) + (0.3 * deadline_score) + (0.2 * utilization_score),
109
- 4,
110
  )
111
 
112
  def _grade_components(
 
49
  UNFINISHED_PRIORITY_PENALTY: float = -0.02
50
  OVERDUE_PRIORITY_PENALTY_PER_TICK: float = -0.005
51
  MAX_RECENT_FAILURE_EVENTS: int = 6
52
+ MIN_GRADER_SCORE: float = 0.01
53
+ MAX_GRADER_SCORE: float = 0.99
54
 
55
  def __init__(self):
56
  self._state = State(episode_id=str(uuid4()), step_count=0)
 
102
  score = lower_bound / max(lower_bound, env_state.current_time)
103
  return round(score, 4)
104
 
105
+ def _bounded_grader_score(self, score: float) -> float:
106
+ return round(
107
+ min(self.MAX_GRADER_SCORE, max(self.MIN_GRADER_SCORE, score)),
108
+ 4,
109
+ )
110
+
111
  def _benchmark_score(self) -> float:
112
  makespan_score, deadline_score, utilization_score = self._grade_components(
113
  include_terminal_makespan=True
114
  )
115
+ return self._bounded_grader_score(
116
+ (0.5 * makespan_score) + (0.3 * deadline_score) + (0.2 * utilization_score)
 
117
  )
118
 
119
  def _grade_components(
uv.lock CHANGED
@@ -1613,6 +1613,7 @@ version = "0.1.0"
1613
  source = { editable = "." }
1614
  dependencies = [
1615
  { name = "gradio" },
 
1616
  { name = "openenv-core", extra = ["core"] },
1617
  { name = "plotly" },
1618
  ]
@@ -1626,6 +1627,7 @@ dev = [
1626
  [package.metadata]
1627
  requires-dist = [
1628
  { name = "gradio", specifier = ">=5.0.0" },
 
1629
  { name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" },
1630
  { name = "plotly", specifier = ">=5.24.0" },
1631
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
 
1613
  source = { editable = "." }
1614
  dependencies = [
1615
  { name = "gradio" },
1616
+ { name = "openai" },
1617
  { name = "openenv-core", extra = ["core"] },
1618
  { name = "plotly" },
1619
  ]
 
1627
  [package.metadata]
1628
  requires-dist = [
1629
  { name = "gradio", specifier = ">=5.0.0" },
1630
+ { name = "openai", specifier = ">=1.0.0" },
1631
  { name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" },
1632
  { name = "plotly", specifier = ">=5.24.0" },
1633
  { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },