diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4c49bd78f1d08f2bc09fa0bd8191ed38b7dce5e3 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.env diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..75fe30d220e611f91c69e327ce7fe65771f10e3e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.10-slim + +WORKDIR /app + +# Copy all files +COPY . . + +# Install dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Install extra needed libs +RUN pip install pyyaml + +# Set environment +ENV PYTHONPATH=/app + +# Default command +CMD ["uvicorn", "app_server:app", "--host", "0.0.0.0", "--port", "7860"] \ No newline at end of file diff --git a/README.md b/README.md index 5ed2c48fbc223b5a48ef25c6c704b86594a95c85..155792416883852216bec4c916cc68da0bd6c73f 100644 --- a/README.md +++ b/README.md @@ -7,5 +7,235 @@ sdk: docker pinned: false license: mit --- +# 🧠 OpenEnv Workflow Agent β€” Decision-Making Under Uncertainty + +## πŸš€ Overview + +We present a **real-world OpenEnv environment** that simulates workflow management tasks such as email triage, scheduling, and task handling under **partial observability**. 
+ +Unlike typical environments, this benchmark focuses on a critical but underexplored capability: + +> πŸ”₯ **Cost-aware information gathering in sequential decision-making** + +Agents must decide: +- When to act immediately +- When to request additional information +- Whether the cost of uncertainty reduction is justified + +--- + +## 🎯 Why This Matters + +Modern AI agents (LLMs, assistants, copilots) operate in **uncertain environments**: +- Emails are ambiguous +- User intent is hidden +- Context is incomplete + +Our environment models this realistically by enforcing: + +- ❗ Incorrect actions under uncertainty β†’ penalized +- ❗ Information gathering β†’ beneficial but costly +- ❗ Multi-step reasoning required for optimal decisions + +--- + +## 🧠 Core Idea + +We introduce a **POMDP-style workflow environment** where: + +- The true state is partially hidden +- Agents must **actively reduce uncertainty** +- Information acquisition has a **non-zero cost** + +### Key Property: + +> An optimal agent follows: +> +> **β€œRequest information only when expected benefit exceeds cost.”** + +--- + +## βš™οΈ Environment Design + +### πŸ”Ή State + +- Emails (observed) +- Tasks & calendar (observed) +- Hidden attributes: + - true intent + - urgency + - missing information + +--- + +### πŸ”Ή Actions + +- `classify` +- `reply` +- `schedule` +- `request_info` +- `archive` +- `prioritize` + +--- + +### πŸ”Ή Reward Function + +\[ +r_t = r_{correct} + r_{progress} - r_{cost} - r_{penalty} +\] + +- Correct action β†’ +0.3 +- Task progress β†’ +0.2 +- Step penalty β†’ βˆ’0.01 +- Information request cost β†’ βˆ’0.05 +- Incorrect action β†’ βˆ’0.2 + +--- + +## πŸ§ͺ Tasks + +### 🟒 Easy +- Clear intent +- Single-step decision + +### 🟑 Medium +- Multi-step workflow +- Requires sequencing + +### πŸ”΄ Hard +- Ambiguous input +- Requires **information gathering before acting** + +--- + +## πŸ“Š Baseline Results + +``` + +easy: 1.00 +medium: 0.50 +hard: 0.13 + +``` + +### πŸ” Interpretation 
+ +- Baseline performs well on simple tasks +- Fails on ambiguous scenarios +- Demonstrates need for **information-aware policies** + +--- + +## πŸ”₯ Key Insight + +Standard agents fail because they **act too early under uncertainty**. + +Agents that act immediately under uncertainty fail. +Agents that strategically gather information succeed. + +This environment makes that tradeoff explicit and measurable. + +Our environment exposes this failure mode clearly. + +--- + +## 🧩 Novel Contribution + +We introduce: + +### βœ… Cost-sensitive information gathering +- Asking questions is beneficial but not free + +### βœ… Enforced uncertainty +- Actions without information are penalized + +### βœ… Sequential dependency +- Early decisions affect future rewards + +--- + +## πŸ§ͺ Validation + +We verify: + +- βœ” Classification fails under missing information +- βœ” Requesting info enables correct decisions +- βœ” Tradeoff emerges between cost and accuracy + +--- + +## πŸ“¦ Project Structure + +``` + +app/ +tasks/ +graders/ +baseline/ +scripts/ +openenv.yaml +Dockerfile +inference.py + +```` + +--- + +## ▢️ Run Locally + +You can pull the pre-built Docker image directly from Docker Hub and run it: + +```bash +docker pull imsachin010/openenv-workflow-agent:latest +docker run -d -p 7860:7860 --name openenv-agent imsachin010/openenv-workflow-agent:latest +``` + +Test endpoint: + +```bash +curl -X POST http://localhost:7860/reset +``` + +--- + +## πŸ€– Inference + +Run the inference script inside the environment: + +```bash +python -m inference +``` + +Outputs: + +``` +[START] +[STEP] +[END] +``` + +--- + +## 🧠 Conclusion + +This environment highlights a key gap in current agents: + +> ❗ They do not reason about **when to gather information** + +We provide a benchmark to evaluate and improve: + +* decision-making under uncertainty +* information-seeking behavior +* sequential reasoning + +--- + +## 🏁 Submission Notes + +* βœ” Fully OpenEnv compliant +* βœ” Deterministic graders 
+* βœ” Reproducible via Docker +* βœ” HF Space endpoint available + -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/__pycache__/normaltest.cpython-313.pyc b/__pycache__/normaltest.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98c46222486ce816776aa688bac20413a8a77ba7 Binary files /dev/null and b/__pycache__/normaltest.cpython-313.pyc differ diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app/__pycache__/__init__.cpython-313.pyc b/app/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0239537966482d1c0d43edb17489046666a5b18b Binary files /dev/null and b/app/__pycache__/__init__.cpython-313.pyc differ diff --git a/app/__pycache__/actions.cpython-313.pyc b/app/__pycache__/actions.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0569fe0492b97f01cb8ea439f8595dbb907aef00 Binary files /dev/null and b/app/__pycache__/actions.cpython-313.pyc differ diff --git a/app/__pycache__/env.cpython-313.pyc b/app/__pycache__/env.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8f5d4e6393027d474731890cdc37d5b9171b7f5 Binary files /dev/null and b/app/__pycache__/env.cpython-313.pyc differ diff --git a/app/__pycache__/observation.cpython-313.pyc b/app/__pycache__/observation.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f5c94684c2f596015697e15b9737421d59d1234 Binary files /dev/null and b/app/__pycache__/observation.cpython-313.pyc differ diff --git a/app/__pycache__/reward.cpython-313.pyc b/app/__pycache__/reward.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45f167d9a0dc02357a8a85c67abe17b959e4a10f Binary files /dev/null and b/app/__pycache__/reward.cpython-313.pyc 
differ diff --git a/app/__pycache__/state.cpython-313.pyc b/app/__pycache__/state.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01facf123d573c0213ba24869bd7326304c5ab63 Binary files /dev/null and b/app/__pycache__/state.cpython-313.pyc differ diff --git a/app/__pycache__/transition.cpython-313.pyc b/app/__pycache__/transition.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c401e4d6bcb8f189247ed91a1307e71b04d4edd6 Binary files /dev/null and b/app/__pycache__/transition.cpython-313.pyc differ diff --git a/app/actions.py b/app/actions.py new file mode 100644 index 0000000000000000000000000000000000000000..af917b7fe60e7c6ea07fe06bad7d1ef59b9dfac3 --- /dev/null +++ b/app/actions.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel +from typing import Optional, Literal, Dict + + +ActionType = Literal[ + "classify", + "reply", + "schedule", + "prioritize", + "request_info", + "archive" +] + + +class Action(BaseModel): + type: ActionType + target_id: str # email/task id + payload: Optional[Dict] = None # flexible for different actions \ No newline at end of file diff --git a/app/env.py b/app/env.py new file mode 100644 index 0000000000000000000000000000000000000000..7d626dd746eeca1b535d3139b6de11febfb299d0 --- /dev/null +++ b/app/env.py @@ -0,0 +1,67 @@ +from typing import Tuple, Dict, Any +from copy import deepcopy + +from .state import EnvironmentState +from .observation import Observation +from .actions import Action +from .transition import apply_action +from .reward import compute_reward + + +class WorkflowEnv: + def __init__(self, initial_state: EnvironmentState): + self.initial_state = deepcopy(initial_state) + self._state = deepcopy(initial_state) + + # ----------------------------- + # RESET + # ----------------------------- + def reset(self) -> Observation: + self._state = deepcopy(self.initial_state) + return self._get_observation() + + # ----------------------------- + # STEP + # 
----------------------------- + def step(self, action: Action) -> Tuple[Observation, float, bool, Dict[str, Any]]: + if self._state.done: + raise Exception("Episode already finished. Call reset().") + + # Log action + self._state.history.append({ + "timestep": self._state.timestep, + "action": action.model_dump() + }) + + # βœ… APPLY TRANSITION (NEW) + self._state, info = apply_action(self._state, action) + + # βœ… COMPUTE REWARD (NEW) + reward = compute_reward(self._state, action.type, info) + + # Increment timestep + self._state.timestep += 1 + + # Episode termination + if self._state.timestep >= 10: + self._state.done = True + + return self._get_observation(), reward, self._state.done, {} + + # ----------------------------- + # STATE ACCESS + # ----------------------------- + def state(self) -> EnvironmentState: + return self._state + + # ----------------------------- + # OBSERVATION + # ----------------------------- + def _get_observation(self) -> Observation: + return Observation( + emails=self._state.emails, + tasks=self._state.tasks, + calendar=self._state.calendar, + history=self._state.history, + timestep=self._state.timestep + ) \ No newline at end of file diff --git a/app/observation.py b/app/observation.py new file mode 100644 index 0000000000000000000000000000000000000000..8e6dc51fde7502f1bf774b227c1e6fb7f6c9d243 --- /dev/null +++ b/app/observation.py @@ -0,0 +1,11 @@ +from pydantic import BaseModel +from typing import List, Dict +from .state import Email, Task, CalendarEvent + + +class Observation(BaseModel): + emails: List[Email] + tasks: List[Task] + calendar: List[CalendarEvent] + history: List[Dict] + timestep: int \ No newline at end of file diff --git a/app/reward.py b/app/reward.py new file mode 100644 index 0000000000000000000000000000000000000000..da25ed8ee87d50fb20f6191efe9fd9f4ab026718 --- /dev/null +++ b/app/reward.py @@ -0,0 +1,29 @@ +from app.state import EnvironmentState + + +def compute_reward(state: EnvironmentState, action_type: str, 
info: dict) -> float: + reward = 0.0 + + # --- Correctness --- + if info.get("correct_action"): + reward += 0.2 + + # Cost for asking info (tradeoff) + if action_type == "request_info": + reward -= 0.05 # cost for querying + elif info.get("incorrect_action"): + reward -= 0.2 + + # --- Progress --- + if info.get("task_progress"): + reward += 0.2 + + # --- Step penalty (efficiency) + reward -= 0.01 + + # --- Deadline penalty --- + for hidden in state.hidden_email_states: + if hidden.deadline and state.timestep > hidden.deadline: + reward -= 0.5 + + return reward \ No newline at end of file diff --git a/app/state.py b/app/state.py new file mode 100644 index 0000000000000000000000000000000000000000..df998e15147cc8d18c9f004b9cf74b9d7411d3ed --- /dev/null +++ b/app/state.py @@ -0,0 +1,55 @@ +from pydantic import BaseModel, Field +from typing import List, Optional, Dict +from enum import Enum + + +class EmailPriority(str, Enum): + LOW = "low" + MEDIUM = "medium" + HIGH = "high" + + +class Email(BaseModel): + id: str + sender: str + subject: str + body: str + + +class HiddenEmailState(BaseModel): + email_id: str + true_intent: str # e.g., "meeting_request", "spam", "task" + urgency: EmailPriority + requires_response: bool + deadline: Optional[int] # timestep deadline + missing_information: bool # does agent need to ask clarification? 
+ + +class Task(BaseModel): + id: str + description: str + completed: bool = False + deadline: Optional[int] + + +class CalendarEvent(BaseModel): + id: str + title: str + time: int + + +class EnvironmentState(BaseModel): + # Observed components + emails: List[Email] + tasks: List[Task] + calendar: List[CalendarEvent] + history: List[Dict] = Field(default_factory=list) + + # Hidden components (NOT exposed to agent) + hidden_email_states: List[HiddenEmailState] + + # Global timestep + timestep: int = 0 + + # Episode termination + done: bool = False \ No newline at end of file diff --git a/app/transition.py b/app/transition.py new file mode 100644 index 0000000000000000000000000000000000000000..9b988c8773d9c9b23b6668fc64b9f2dcd1fc5f95 --- /dev/null +++ b/app/transition.py @@ -0,0 +1,77 @@ +from app.state import EnvironmentState + + +def apply_action(state: EnvironmentState, action): + info = { + "correct_action": False, + "incorrect_action": False, + "task_progress": False + } + + # Find hidden truth + hidden = next( + (h for h in state.hidden_email_states if h.email_id == action.target_id), + None + ) + # ---------------------------- + # CLASSIFY + # ---------------------------- + # if action.type == "classify": + # predicted = action.payload.get("label") if action.payload else None + + # if hidden: + # # πŸ”₯ NEW: penalize guessing under uncertainty + # if hidden.missing_information: + # info["incorrect_action"] = True # cannot classify correctly without info + + # elif predicted == hidden.true_intent: + # info["correct_action"] = True + # info["task_progress"] = True + + # else: + # info["incorrect_action"] = True + if action.type == "classify": + predicted = action.payload.get("label") if action.payload else None + + if not hidden: + info["incorrect_action"] = True + + elif hidden.missing_information: + # ❌ Cannot classify without info + info["incorrect_action"] = True + + else: + # βœ… Now classification is allowed + if predicted == hidden.true_intent: + 
info["correct_action"] = True + info["task_progress"] = True + else: + info["incorrect_action"] = True + # ---------------------------- + # ARCHIVE + # ---------------------------- + elif action.type == "archive": + state.emails = [e for e in state.emails if e.id != action.target_id] + info["task_progress"] = True + + # ---------------------------- + # REQUEST INFO + # ---------------------------- + elif action.type == "request_info": + if hidden and hidden.missing_information: + hidden.missing_information = False + info["correct_action"] = True + else: + info["incorrect_action"] = True + + # ---------------------------- + # REPLY + # ---------------------------- + elif action.type == "reply": + if hidden and hidden.requires_response: + hidden.requires_response = False + info["correct_action"] = True + else: + info["incorrect_action"] = True + + return state, info \ No newline at end of file diff --git a/app/utils.py b/app/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app_server.py b/app_server.py new file mode 100644 index 0000000000000000000000000000000000000000..772399fa7ddf2c3bfa33dfdea77c4c8db950faed --- /dev/null +++ b/app_server.py @@ -0,0 +1,22 @@ +from fastapi import FastAPI +from app.env import WorkflowEnv +from tasks.easy import create_easy_task + +app = FastAPI() + + +@app.post("/reset") +def reset(): + state, _ = create_easy_task() + env = WorkflowEnv(state) + obs = env.reset() + + return {"status": "ok"} + +@app.get("/") +def root(): + return {"message": "Workflow Env is running"} + +@app.get("/") +def home(): + return {"message": "Workflow Env API running"} \ No newline at end of file diff --git a/baseline/__init__.py b/baseline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/baseline/__pycache__/__init__.cpython-313.pyc b/baseline/__pycache__/__init__.cpython-313.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..8c6d43c8a97654883bff28602795ea981ea8c75d Binary files /dev/null and b/baseline/__pycache__/__init__.cpython-313.pyc differ diff --git a/baseline/__pycache__/policy.cpython-313.pyc b/baseline/__pycache__/policy.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbed7c0780e0a378a71c6e22044caf3569409889 Binary files /dev/null and b/baseline/__pycache__/policy.cpython-313.pyc differ diff --git a/baseline/__pycache__/run_baseline.cpython-313.pyc b/baseline/__pycache__/run_baseline.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69f9d6a24750c2c0c0e79a69e9205cbc866fae60 Binary files /dev/null and b/baseline/__pycache__/run_baseline.cpython-313.pyc differ diff --git a/baseline/policy.py b/baseline/policy.py new file mode 100644 index 0000000000000000000000000000000000000000..8d9e545f59dd192519b75203b272344c0898fd2b --- /dev/null +++ b/baseline/policy.py @@ -0,0 +1,30 @@ +from app.actions import Action + + +class BaselinePolicy: + def act(self, observation): + if not observation.emails: + return None + + email = observation.emails[0] + text = (email.subject + " " + email.body).lower() + + # Heuristic rules + if "meet" in text: + return Action( + type="classify", + target_id=email.id, + payload={"label": "meeting_request"} + ) + + elif "report" in text or "update" in text: + return Action( + type="classify", + target_id=email.id, + payload={"label": "task_request"} + ) + + return Action( + type="archive", + target_id=email.id + ) \ No newline at end of file diff --git a/baseline/run_baseline.py b/baseline/run_baseline.py new file mode 100644 index 0000000000000000000000000000000000000000..c5c50ef593c5c70c16fa4f26facca6fa59286fcf --- /dev/null +++ b/baseline/run_baseline.py @@ -0,0 +1,54 @@ +from tasks.easy import create_easy_task +from tasks.medium import create_medium_task +from tasks.hard import create_hard_task + +from graders.easy_grader 
import EasyGrader +from graders.medium_grader import MediumGrader +from graders.hard_grader import HardGrader + +from app.env import WorkflowEnv +from baseline.policy import BaselinePolicy + + +def run_task(task_name, create_task_fn, grader_cls): + state, ground_truth = create_task_fn() + env = WorkflowEnv(state) + policy = BaselinePolicy() + + obs = env.reset() + + done = False + steps = 0 + + while not done and steps < 10: + action = policy.act(obs) + + if action is None: + break + + obs, reward, done, _ = env.step(action) + steps += 1 + + trajectory = env.state().history + print(f"{task_name} trajectory:", trajectory) + + grader = grader_cls() + score = grader.grade(trajectory, ground_truth) + + return score + + +def main(): + results = {} + + results["easy"] = run_task("easy", create_easy_task, EasyGrader) + results["medium"] = run_task("medium", create_medium_task, MediumGrader) + results["hard"] = run_task("hard", create_hard_task, HardGrader) + + print("\n===== BASELINE RESULTS =====") + for k, v in results.items(): + print(f"{k}: {round(v, 3)}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/graders/__init__.py b/graders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/graders/__pycache__/__init__.cpython-313.pyc b/graders/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..632627e59ab2858591f90973b5197211fb0f6b94 Binary files /dev/null and b/graders/__pycache__/__init__.cpython-313.pyc differ diff --git a/graders/__pycache__/base.cpython-313.pyc b/graders/__pycache__/base.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d585157748337fdb7ad76bea8c8e3b2d59db1d9e Binary files /dev/null and b/graders/__pycache__/base.cpython-313.pyc differ diff --git a/graders/__pycache__/easy_grader.cpython-313.pyc b/graders/__pycache__/easy_grader.cpython-313.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..ee73d41f4660847223b57e503a016a23cd98b465 Binary files /dev/null and b/graders/__pycache__/easy_grader.cpython-313.pyc differ diff --git a/graders/__pycache__/hard_grader.cpython-313.pyc b/graders/__pycache__/hard_grader.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae4123385c67cea1056b13083de93c89811cc672 Binary files /dev/null and b/graders/__pycache__/hard_grader.cpython-313.pyc differ diff --git a/graders/__pycache__/medium_grader.cpython-313.pyc b/graders/__pycache__/medium_grader.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2c0c2c93621d23f23a4e51088613f043e1569ad Binary files /dev/null and b/graders/__pycache__/medium_grader.cpython-313.pyc differ diff --git a/graders/base.py b/graders/base.py new file mode 100644 index 0000000000000000000000000000000000000000..c379e01430eef5d4ff594f394a6e170484963dbe --- /dev/null +++ b/graders/base.py @@ -0,0 +1,3 @@ +class BaseGrader: + def grade(self, trajectory, ground_truth) -> float: + raise NotImplementedError \ No newline at end of file diff --git a/graders/easy_grader.py b/graders/easy_grader.py new file mode 100644 index 0000000000000000000000000000000000000000..125f6a04d0c7b2c8c965bd2ba298160aaa8c1cdd --- /dev/null +++ b/graders/easy_grader.py @@ -0,0 +1,17 @@ +from graders.base import BaseGrader + + +class EasyGrader(BaseGrader): + def grade(self, trajectory, ground_truth) -> float: + correct_label = ground_truth["label"] + + for step in trajectory: + action = step["action"] + + if action["type"] == "classify": + if action.get("payload", {}).get("label") == correct_label: + return 1.0 + else: + return 0.0 + + return 0.0 \ No newline at end of file diff --git a/graders/hard_grader.py b/graders/hard_grader.py new file mode 100644 index 0000000000000000000000000000000000000000..6de8d041663ecbc2fc87f84743674775ba304dc6 --- /dev/null +++ b/graders/hard_grader.py @@ -0,0 +1,26 
@@ +from graders.base import BaseGrader + + +class HardGrader(BaseGrader): + def grade(self, trajectory, ground_truth) -> float: + expected_sequence = ground_truth["sequence"] + + matched = 0 + penalty = 0 + + for i, step in enumerate(trajectory): + if i >= len(expected_sequence): + break + + action = step["action"] + expected = expected_sequence[i] + + if action["type"] == expected["type"]: + matched += 1 + else: + penalty += 1 + + score = matched / len(expected_sequence) + score -= 0.1 * penalty + + return max(0.0, min(1.0, score)) \ No newline at end of file diff --git a/graders/medium_grader.py b/graders/medium_grader.py new file mode 100644 index 0000000000000000000000000000000000000000..b00ab702dc788de3e18cf9d7f859289e2cfe907d --- /dev/null +++ b/graders/medium_grader.py @@ -0,0 +1,22 @@ +from graders.base import BaseGrader + + +class MediumGrader(BaseGrader): + def grade(self, trajectory, ground_truth) -> float: + expected_sequence = ground_truth["sequence"] + + score = 0.0 + matched = 0 + + for i, step in enumerate(trajectory): + if i >= len(expected_sequence): + break + + action = step["action"] + expected = expected_sequence[i] + + if action["type"] == expected["type"]: + matched += 1 + + score = matched / len(expected_sequence) + return score \ No newline at end of file diff --git a/inference.py b/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..8dc2ad395cc80121227c647b38fa8c8ea5648c4a --- /dev/null +++ b/inference.py @@ -0,0 +1,114 @@ +import os +from dotenv import load_dotenv +from openai import OpenAI + +load_dotenv() + +from app.env import WorkflowEnv +from app.actions import Action +from tasks.hard import create_hard_task +from graders.hard_grader import HardGrader +# ---------------- ENV CONFIG ---------------- +API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1") +MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct") +HF_TOKEN = os.getenv("HF_TOKEN") +LOCAL_IMAGE_NAME = 
os.getenv("LOCAL_IMAGE_NAME") + +client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN) + + +# ---------------- LOGGING ---------------- +def log_start(task, env, model): + print(f"[START] task={task} env={env} model={model}", flush=True) + + +def log_step(step, action, reward, done, error): + print( + f"[STEP] step={step} action={action} reward={reward:.2f} done={str(done).lower()} error={error or 'null'}", + flush=True, + ) + + +def log_end(success, steps, score, rewards): + rewards_str = ",".join(f"{r:.2f}" for r in rewards) + print( + f"[END] success={str(success).lower()} steps={steps} score={score:.2f} rewards={rewards_str}", + flush=True, + ) + + +# ---------------- SIMPLE POLICY ---------------- +def get_action(obs): + if not obs.emails: + return None + + email = obs.emails[0] + + # πŸ”₯ IMPORTANT: detect if we already asked info + already_asked = any( + h["action"]["type"] == "request_info" + for h in obs.history + ) + + text = (email.subject + " " + email.body).lower() + + # If info already requested β†’ do NOT ask again + if already_asked: + return Action( + type="classify", + target_id=email.id, + payload={"label": "meeting_request"} + ) + + # First step: ask info if ambiguous + if "sometime" in text or "next week" in text: + return Action(type="request_info", target_id=email.id) + + return Action(type="archive", target_id=email.id) + + +# ---------------- MAIN ---------------- +def main(): + state, gt = create_hard_task() + env = WorkflowEnv(state) + grader = HardGrader() + + obs = env.reset() + + rewards = [] + steps = 0 + + log_start("hard", "workflow-env", MODEL_NAME) + + try: + done = False + + while not done and steps < 10: + action = get_action(obs) + if action is None: + break + + obs, reward, done, _ = env.step(action) + + rewards.append(reward) + steps += 1 + + log_step(steps, action.type, reward, done, None) + + # πŸ”₯ STOP CONDITION (IMPORTANT) + if action.type == "classify": + break + + trajectory = env.state().history + score = 
grader.grade(trajectory, gt) + + score = max(0.0, min(1.0, score)) + + success = score > 0.3 + + finally: + log_end(success, steps, score, rewards) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/normaltest.py b/normaltest.py new file mode 100644 index 0000000000000000000000000000000000000000..a13a002bf2cefc0ab5df3762555c290a0ccc0a84 --- /dev/null +++ b/normaltest.py @@ -0,0 +1,21 @@ +from tasks.easy import create_easy_task +from app.env import WorkflowEnv +from app.actions import Action + +state, gt = create_easy_task() +env = WorkflowEnv(state) + +obs = env.reset() +print("Initial:", obs) + +# Try correct classify +action = Action( + type="classify", + target_id="1", + payload={"label": "meeting_request"} +) + +obs, reward, done, _ = env.step(action) + +print("After step:", obs) +print("Reward:", reward) \ No newline at end of file diff --git a/openenv.yaml b/openenv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1cea30e79117775c31534a5820faac350c2462ee --- /dev/null +++ b/openenv.yaml @@ -0,0 +1,53 @@ +name: workflow-agent-env +description: > + A real-world environment simulating email and workflow management under partial observability. + Agents must classify, respond, and manage tasks with incomplete information. 
+ +version: "1.0" + +entry_point: app.env:WorkflowEnv + +observation_space: + type: object + properties: + emails: + type: array + description: List of emails in inbox + tasks: + type: array + calendar: + type: array + history: + type: array + timestep: + type: integer + +action_space: + type: object + properties: + type: + type: string + enum: + - classify + - reply + - schedule + - prioritize + - request_info + - archive + target_id: + type: string + payload: + type: object + +tasks: + - name: easy + generator: tasks.easy:create_easy_task + grader: graders.easy_grader:EasyGrader + + - name: medium + generator: tasks.medium:create_medium_task + grader: graders.medium_grader:MediumGrader + + - name: hard + generator: tasks.hard:create_hard_task + grader: graders.hard_grader:HardGrader \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..84f3970779ecf33c537ca1cf57b3b72ce6d39168 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +pydantic==2.7.1 +typing-extensions +python-dotenv +pytest +pyyaml +fastapi +uvicorn +openai \ No newline at end of file diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/__pycache__/__init__.cpython-313.pyc b/scripts/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8408cf0b5e17566a232958a7afba3481a8b6dcd Binary files /dev/null and b/scripts/__pycache__/__init__.cpython-313.pyc differ diff --git a/scripts/__pycache__/validate_env.cpython-313.pyc b/scripts/__pycache__/validate_env.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30b001446e261a5d51e2e81e49609fcd4529db51 Binary files /dev/null and b/scripts/__pycache__/validate_env.cpython-313.pyc differ diff --git a/scripts/run_all_tasks.py b/scripts/run_all_tasks.py new file 
mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/validate_env.py b/scripts/validate_env.py new file mode 100644 index 0000000000000000000000000000000000000000..a4dca66f1abe2706fa9afd1486c1adf24cbc4fbd --- /dev/null +++ b/scripts/validate_env.py @@ -0,0 +1,35 @@ +import importlib +import yaml + + +def validate_yaml(): + with open("openenv.yaml", "r") as f: + config = yaml.safe_load(f) + + print("βœ” YAML loaded") + + # Check entry point + module_name, class_name = config["entry_point"].split(":") + module = importlib.import_module(module_name) + getattr(module, class_name) + + print("βœ” Entry point valid") + + # Check tasks + for task in config["tasks"]: + gen_module, gen_fn = task["generator"].split(":") + grader_module, grader_cls = task["grader"].split(":") + + gen_mod = importlib.import_module(gen_module) + getattr(gen_mod, gen_fn) + + grader_mod = importlib.import_module(grader_module) + getattr(grader_mod, grader_cls) + + print(f"βœ” Task validated: {task['name']}") + + print("\nβœ… All validations passed!") + + +if __name__ == "__main__": + validate_yaml() \ No newline at end of file diff --git a/tasks/__init__.py b/tasks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tasks/__pycache__/__init__.cpython-313.pyc b/tasks/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77affb4509dab52b861fb95a27341f918a2da6b2 Binary files /dev/null and b/tasks/__pycache__/__init__.cpython-313.pyc differ diff --git a/tasks/__pycache__/easy.cpython-313.pyc b/tasks/__pycache__/easy.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33bbfde277207302b56fb4494ef812f73c57e6be Binary files /dev/null and b/tasks/__pycache__/easy.cpython-313.pyc differ diff --git a/tasks/__pycache__/hard.cpython-313.pyc 
b/tasks/__pycache__/hard.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c530d87e9343767bab3f56a464b43a5e5824f1b Binary files /dev/null and b/tasks/__pycache__/hard.cpython-313.pyc differ diff --git a/tasks/__pycache__/medium.cpython-313.pyc b/tasks/__pycache__/medium.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f48f228dc4e8012beb1019c666c59040de052f45 Binary files /dev/null and b/tasks/__pycache__/medium.cpython-313.pyc differ diff --git a/tasks/easy.py b/tasks/easy.py new file mode 100644 index 0000000000000000000000000000000000000000..ad08167d2b1d16126ae02a861a8f1e746c059ee0 --- /dev/null +++ b/tasks/easy.py @@ -0,0 +1,33 @@ +from app.state import EnvironmentState, Email, HiddenEmailState + + +def create_easy_task(): + email = Email( + id="1", + sender="boss@company.com", + subject="Team Meeting", + body="Schedule a meeting tomorrow at 10am" + ) + + hidden = HiddenEmailState( + email_id="1", + true_intent="meeting_request", + urgency="high", + requires_response=True, + deadline=5, + missing_information=False + ) + + state = EnvironmentState( + emails=[email], + tasks=[], + calendar=[], + hidden_email_states=[hidden] + ) + + ground_truth = { + "correct_action": "classify", + "label": "meeting_request" + } + + return state, ground_truth \ No newline at end of file diff --git a/tasks/generator.py b/tasks/generator.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tasks/hard.py b/tasks/hard.py new file mode 100644 index 0000000000000000000000000000000000000000..4f42755a161daf25577bfe63eff6753ff5ef1026 --- /dev/null +++ b/tasks/hard.py @@ -0,0 +1,36 @@ +from app.state import EnvironmentState, Email, HiddenEmailState + + +def create_hard_task(): + email = Email( + id="1", + sender="manager@company.com", + subject="Catch up", + body="Let's meet sometime next week" + ) + + hidden = HiddenEmailState( + email_id="1", + 
from app.state import EnvironmentState, Email, HiddenEmailState


def create_medium_task():
    """Build the medium benchmark task: a clear but multi-step client request.

    Returns:
        tuple: ``(EnvironmentState, dict)`` — the initial observable state and
        a ground-truth dict whose ``sequence`` lists the expected action order
        (classify as ``task_request``, then reply).
    """
    # Intent is clear, but fulfilling it takes more than one action.
    client_email = Email(
        id="1",
        sender="client@company.com",
        subject="Project Update",
        body="Please send the latest report and confirm timeline",
    )

    # Hidden attributes: no missing information, so no request_info needed.
    concealed = HiddenEmailState(
        email_id="1",
        true_intent="task_request",
        urgency="medium",
        requires_response=True,
        deadline=6,
        missing_information=False,
    )

    initial_state = EnvironmentState(
        emails=[client_email],
        tasks=[],
        calendar=[],
        hidden_email_states=[concealed],
    )

    expected = {
        "sequence": [
            {"type": "classify", "label": "task_request"},
            {"type": "reply"},
        ]
    }
    return initial_state, expected
"""Smoke-run of the workflow environment: reset, take one action, print results."""
from app.actions import Action
from app.env import WorkflowEnv
from app.state import EnvironmentState, Email, HiddenEmailState

# One ambiguous email whose true intent is hidden from the agent.
initial_state = EnvironmentState(
    emails=[
        Email(
            id="1",
            sender="boss@company.com",
            subject="Meeting",
            body="Let's meet",
        )
    ],
    tasks=[],
    calendar=[],
    hidden_email_states=[
        HiddenEmailState(
            email_id="1",
            true_intent="meeting_request",
            urgency="high",
            requires_response=True,
            deadline=5,
            missing_information=True,
        )
    ],
)

env = WorkflowEnv(initial_state)

obs = env.reset()
print(obs)

# Archive the email and show the resulting observation, reward, and done flag.
obs, reward, done, _ = env.step(Action(type="archive", target_id="1"))
print(obs, reward, done)
"""Compare reward streams on the hard task with and without an information request."""
from tasks.hard import create_hard_task
from app.env import WorkflowEnv
from app.actions import Action


def test_info_cost():
    """Print per-step rewards for classify-after-request_info vs. direct classify."""
    # ------------------------------------------------------------------
    # Case 1: pay the information cost first, then classify.
    # ------------------------------------------------------------------
    state, _ = create_hard_task()
    env = WorkflowEnv(state)
    obs = env.reset()

    obs, r1, _, _ = env.step(Action(type="request_info", target_id="1"))
    obs, r2, _, _ = env.step(
        Action(type="classify", target_id="1", payload={"label": "meeting_request"})
    )

    print("\nWITH INFO:")
    print("request_info:", r1)
    print("classify:", r2)

    # ------------------------------------------------------------------
    # Case 2: classify immediately on a fresh copy of the same task.
    # ------------------------------------------------------------------
    state2, _ = create_hard_task()
    env2 = WorkflowEnv(state2)
    obs = env2.reset()

    obs, r_direct, _, _ = env2.step(
        Action(type="classify", target_id="1", payload={"label": "meeting_request"})
    )

    print("\nWITHOUT INFO:")
    print("direct classify:", r_direct)


if __name__ == "__main__":
    test_info_cost()