Spaces:
Running
Running
Deploy Project Epsilon Space bundle
Browse files
- README.md +3 -3
- app.py +87 -6
- docs/HF_SPACE_README.md +3 -3
- src/executive_assistant/deployment.py +3 -3
- tests/test_app.py +41 -5
README.md
CHANGED
|
@@ -65,7 +65,7 @@ To make that possible under hackathon constraints, we replaced live services wit
|
|
| 65 |
- **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
|
| 66 |
- **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
|
| 67 |
- **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
|
| 68 |
-
- **App policies:** deterministic baseline and tabular RL checkpoint
|
| 69 |
- **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
|
| 70 |
- **UI layer:** Gradio control room plus visible workspace snapshots for judges
|
| 71 |
|
|
@@ -88,7 +88,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
|
|
| 88 |
- Live observation payloads
|
| 89 |
- Workspace tables for emails, todos, files, and action logs
|
| 90 |
- Step-by-step trace rows with reasoning, action type, status, score, and done state
|
| 91 |
-
- Differences between `baseline` and the bundled `rl` checkpoint policy
|
| 92 |
|
| 93 |
## Runtime And Deployment Notes
|
| 94 |
|
|
@@ -106,7 +106,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
|
|
| 106 |
|
| 107 |
1. Open the Space and choose one of the seeded scenarios.
|
| 108 |
2. Run the deterministic `baseline` policy for a guaranteed reference trace.
|
| 109 |
-
3. Switch to `rl`
|
| 110 |
4. Compare how the workspace mutates after each step instead of evaluating only the final response.
|
| 111 |
5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
|
| 112 |
|
|
|
|
| 65 |
- **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
|
| 66 |
- **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
|
| 67 |
- **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
|
| 68 |
+
- **App policies:** deterministic baseline and an RL mode where OpenRouter Gemma generates actions using the tabular RL checkpoint recommendation as guidance
|
| 69 |
- **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
|
| 70 |
- **UI layer:** Gradio control room plus visible workspace snapshots for judges
|
| 71 |
|
|
|
|
| 88 |
- Live observation payloads
|
| 89 |
- Workspace tables for emails, todos, files, and action logs
|
| 90 |
- Step-by-step trace rows with reasoning, action type, status, score, and done state
|
| 91 |
+
- Differences between `baseline` and the OpenRouter-guided bundled `rl` checkpoint policy
|
| 92 |
|
| 93 |
## Runtime And Deployment Notes
|
| 94 |
|
|
|
|
| 106 |
|
| 107 |
1. Open the Space and choose one of the seeded scenarios.
|
| 108 |
2. Run the deterministic `baseline` policy for a guaranteed reference trace.
|
| 109 |
+
3. Switch to `rl` so Gemma receives the learned checkpoint recommendation and generates the runtime action.
|
| 110 |
4. Compare how the workspace mutates after each step instead of evaluating only the final response.
|
| 111 |
5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
|
| 112 |
|
app.py
CHANGED
|
@@ -1,15 +1,17 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
|
|
|
| 4 |
import time
|
| 5 |
import uuid
|
| 6 |
from html import escape
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
|
| 10 |
-
from src.executive_assistant.agent import BaselineAgent
|
| 11 |
-
from src.executive_assistant.config import AppRuntimeConfig, load_env_file
|
| 12 |
from src.executive_assistant.env import ExecutiveAssistantEnv
|
|
|
|
| 13 |
from src.executive_assistant.runner import EpisodeRunner
|
| 14 |
from src.executive_assistant.training import QLearningPolicy, default_checkpoint_path, train_q_learning
|
| 15 |
|
|
@@ -758,6 +760,79 @@ def _ensure_rl_checkpoint(checkpoint_path: str) -> str:
|
|
| 758 |
return str(saved_path)
|
| 759 |
|
| 760 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 761 |
def _build_policy(
|
| 762 |
provider: str,
|
| 763 |
checkpoint_path: str,
|
|
@@ -765,7 +840,13 @@ def _build_policy(
|
|
| 765 |
if provider == "baseline":
|
| 766 |
return BaselineAgent()
|
| 767 |
if provider == "rl":
|
| 768 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 769 |
raise ValueError(f"Unsupported app policy provider: {provider}")
|
| 770 |
|
| 771 |
|
|
@@ -926,7 +1007,7 @@ with gr.Blocks(title="Autonomous Executive Assistant Sandbox") as demo:
|
|
| 926 |
<h1>Executive Assistant Sandbox</h1>
|
| 927 |
<p>
|
| 928 |
Run the exact same episode loop used in training, inspect each workspace mutation in real time,
|
| 929 |
-
and compare the deterministic baseline against the
|
| 930 |
</p>
|
| 931 |
<div class="hero-strip">
|
| 932 |
<div class="hero-pill">Shared EpisodeRunner path</div>
|
|
@@ -954,7 +1035,7 @@ with gr.Blocks(title="Autonomous Executive Assistant Sandbox") as demo:
|
|
| 954 |
"""
|
| 955 |
<h2 class="panel-title">Control Room</h2>
|
| 956 |
<p class="panel-copy">
|
| 957 |
-
Pick a scenario, choose baseline or the trained RL JSON checkpoint, and run a stepwise episode against the same environment used by training and evaluation.
|
| 958 |
</p>
|
| 959 |
"""
|
| 960 |
)
|
|
@@ -992,7 +1073,7 @@ with gr.Blocks(title="Autonomous Executive Assistant Sandbox") as demo:
|
|
| 992 |
gr.HTML(
|
| 993 |
"""
|
| 994 |
<p class="footnote">
|
| 995 |
-
The RL policy
|
| 996 |
</p>
|
| 997 |
"""
|
| 998 |
)
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
| 4 |
+
import os
|
| 5 |
import time
|
| 6 |
import uuid
|
| 7 |
from html import escape
|
| 8 |
|
| 9 |
import gradio as gr
|
| 10 |
|
| 11 |
+
from src.executive_assistant.agent import BaselineAgent, OpenRouterPolicy
|
| 12 |
+
from src.executive_assistant.config import AppRuntimeConfig, OpenRouterConfig, load_env_file
|
| 13 |
from src.executive_assistant.env import ExecutiveAssistantEnv
|
| 14 |
+
from src.executive_assistant.models import PolicyDecision, WorkspaceObservation
|
| 15 |
from src.executive_assistant.runner import EpisodeRunner
|
| 16 |
from src.executive_assistant.training import QLearningPolicy, default_checkpoint_path, train_q_learning
|
| 17 |
|
|
|
|
| 760 |
return str(saved_path)
|
| 761 |
|
| 762 |
|
| 763 |
+
class OpenRouterGuidedCheckpointPolicy:
    """Policy that lets an OpenRouter model act on a checkpoint's advice.

    On every step the checkpoint policy is asked for its decision first. When
    a model policy is available, that recommendation is appended to a copy of
    the observation's ``action_history`` and the model picks the action that
    is returned. Without a model policy, or when the model call raises, the
    checkpoint's own action is returned instead.
    """

    def __init__(
        self,
        checkpoint_policy: QLearningPolicy,
        model_policy: OpenRouterPolicy | None,
    ) -> None:
        """Keep the checkpoint policy and the optional model policy."""
        self.checkpoint_policy = checkpoint_policy
        self.model_policy = model_policy

    def choose_action(self, task_name: str, observation: WorkspaceObservation) -> PolicyDecision:
        """Choose the next action for ``task_name`` given ``observation``.

        Returns:
            A ``PolicyDecision`` whose reasoning states which source produced
            the action (model, or checkpoint fallback) and always includes the
            checkpoint's own reasoning.
        """
        recommended = self.checkpoint_policy.choose_action(task_name, observation)

        def checkpoint_fallback(prefix: str) -> PolicyDecision:
            # Shared by both fallback paths: explain why, then reuse the
            # checkpoint's reasoning and action unchanged.
            return PolicyDecision(
                reasoning=prefix + f"{recommended.reasoning}",
                action=recommended.action,
            )

        model = self.model_policy
        if model is None:
            return checkpoint_fallback(
                "OpenRouter model is not configured; using the trained RL checkpoint action. "
            )

        # The recommendation travels to the model as one extra history entry
        # on a copy, so the caller's observation is left untouched.
        history = observation.action_history
        hint = (
            "Trained RL checkpoint recommendation: "
            f"reasoning={recommended.reasoning}; "
            f"action={recommended.action.model_dump()}"
        )
        guided_observation = observation.model_copy(
            update={"action_history": history + [hint]}
        )

        try:
            generated = model.choose_action(task_name, guided_observation)
        except Exception as exc:
            # Any failure of the model call degrades to the checkpoint action.
            return checkpoint_fallback(
                f"OpenRouter model call failed ({exc}); using the trained RL checkpoint action. "
            )
        else:
            explanation = (
                "OpenRouter Gemma generated this action using the trained RL checkpoint recommendation. "
                f"Model reasoning: {generated.reasoning} | Checkpoint recommendation: "
                f"{recommended.reasoning}"
            )
            return PolicyDecision(reasoning=explanation, action=generated.action)
|
| 812 |
+
|
| 813 |
+
|
| 814 |
+
def _build_openrouter_policy() -> OpenRouterPolicy | None:
    """Build the OpenRouter-backed model policy from environment variables.

    The API key is read from ``OPENROUTER_API_KEY`` and, when that is unset or
    blank, from ``OPENAI_API_KEY``. The remaining settings come from the
    ``OPENROUTER_*`` variables read below, each with a built-in default.

    A variable that is set but blank is treated the same as an unset one, so
    an empty value falls back to its default instead of reaching
    ``OpenRouterConfig`` as ``""`` or making ``float()``/``int()`` raise.

    Returns:
        An ``OpenRouterPolicy`` built from the resolved configuration, or
        ``None`` when no API key is available.

    Raises:
        ValueError: If ``OPENROUTER_TEMPERATURE`` or ``OPENROUTER_MAX_TOKENS``
            holds a non-blank value that is not a valid number.
    """

    def setting(name: str, default: str = "") -> str:
        # Blank counts as "not configured", matching how the API key is handled.
        return os.environ.get(name, "").strip() or default

    api_key = setting("OPENROUTER_API_KEY") or setting("OPENAI_API_KEY")
    if not api_key:
        return None

    config = OpenRouterConfig(
        api_key=api_key,
        base_url=setting("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
        model_name=setting("OPENROUTER_MODEL", "google/gemma-4-31b-it"),
        site_url=setting("OPENROUTER_SITE_URL", "http://localhost:7860"),
        app_name=setting(
            "OPENROUTER_APP_NAME",
            "EmailMaestro | Executive Assistant Sandbox",
        ),
        temperature=float(setting("OPENROUTER_TEMPERATURE", "0.1")),
        max_tokens=int(setting("OPENROUTER_MAX_TOKENS", "600")),
    )
    return OpenRouterPolicy(config=config)
|
| 834 |
+
|
| 835 |
+
|
| 836 |
def _build_policy(
|
| 837 |
provider: str,
|
| 838 |
checkpoint_path: str,
|
|
|
|
| 840 |
if provider == "baseline":
|
| 841 |
return BaselineAgent()
|
| 842 |
if provider == "rl":
|
| 843 |
+
checkpoint_policy = QLearningPolicy.load(
|
| 844 |
+
_ensure_rl_checkpoint(checkpoint_path or _default_rl_checkpoint())
|
| 845 |
+
)
|
| 846 |
+
return OpenRouterGuidedCheckpointPolicy(
|
| 847 |
+
checkpoint_policy=checkpoint_policy,
|
| 848 |
+
model_policy=_build_openrouter_policy(),
|
| 849 |
+
)
|
| 850 |
raise ValueError(f"Unsupported app policy provider: {provider}")
|
| 851 |
|
| 852 |
|
|
|
|
| 1007 |
<h1>Executive Assistant Sandbox</h1>
|
| 1008 |
<p>
|
| 1009 |
Run the exact same episode loop used in training, inspect each workspace mutation in real time,
|
| 1010 |
+
and compare the deterministic baseline against the OpenRouter-guided RL checkpoint without losing the structure of the task.
|
| 1011 |
</p>
|
| 1012 |
<div class="hero-strip">
|
| 1013 |
<div class="hero-pill">Shared EpisodeRunner path</div>
|
|
|
|
| 1035 |
"""
|
| 1036 |
<h2 class="panel-title">Control Room</h2>
|
| 1037 |
<p class="panel-copy">
|
| 1038 |
+
Pick a scenario, choose baseline or the OpenRouter-guided trained RL JSON checkpoint, and run a stepwise episode against the same environment used by training and evaluation.
|
| 1039 |
</p>
|
| 1040 |
"""
|
| 1041 |
)
|
|
|
|
| 1073 |
gr.HTML(
|
| 1074 |
"""
|
| 1075 |
<p class="footnote">
|
| 1076 |
+
The RL policy loads the trained JSON checkpoint as guidance, then asks OpenRouter Gemma through the OpenAI client to generate the runtime action. If the model call fails, it falls back to the checkpoint action.
|
| 1077 |
</p>
|
| 1078 |
"""
|
| 1079 |
)
|
docs/HF_SPACE_README.md
CHANGED
|
@@ -65,7 +65,7 @@ To make that possible under hackathon constraints, we replaced live services wit
|
|
| 65 |
- **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
|
| 66 |
- **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
|
| 67 |
- **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
|
| 68 |
-
- **App policies:** deterministic baseline and tabular RL checkpoint
|
| 69 |
- **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
|
| 70 |
- **UI layer:** Gradio control room plus visible workspace snapshots for judges
|
| 71 |
|
|
@@ -88,7 +88,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
|
|
| 88 |
- Live observation payloads
|
| 89 |
- Workspace tables for emails, todos, files, and action logs
|
| 90 |
- Step-by-step trace rows with reasoning, action type, status, score, and done state
|
| 91 |
-
- Differences between `baseline` and the bundled `rl` checkpoint policy
|
| 92 |
|
| 93 |
## Runtime And Deployment Notes
|
| 94 |
|
|
@@ -105,7 +105,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
|
|
| 105 |
|
| 106 |
1. Open the Space and choose one of the seeded scenarios.
|
| 107 |
2. Run the deterministic `baseline` policy for a guaranteed reference trace.
|
| 108 |
-
3. Switch to `rl`
|
| 109 |
4. Compare how the workspace mutates after each step instead of evaluating only the final response.
|
| 110 |
5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
|
| 111 |
|
|
|
|
| 65 |
- **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
|
| 66 |
- **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
|
| 67 |
- **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
|
| 68 |
+
- **App policies:** deterministic baseline and an RL mode where OpenRouter Gemma generates actions using the tabular RL checkpoint recommendation as guidance
|
| 69 |
- **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
|
| 70 |
- **UI layer:** Gradio control room plus visible workspace snapshots for judges
|
| 71 |
|
|
|
|
| 88 |
- Live observation payloads
|
| 89 |
- Workspace tables for emails, todos, files, and action logs
|
| 90 |
- Step-by-step trace rows with reasoning, action type, status, score, and done state
|
| 91 |
+
- Differences between `baseline` and the OpenRouter-guided bundled `rl` checkpoint policy
|
| 92 |
|
| 93 |
## Runtime And Deployment Notes
|
| 94 |
|
|
|
|
| 105 |
|
| 106 |
1. Open the Space and choose one of the seeded scenarios.
|
| 107 |
2. Run the deterministic `baseline` policy for a guaranteed reference trace.
|
| 108 |
+
3. Switch to `rl` so Gemma receives the learned checkpoint recommendation and generates the runtime action.
|
| 109 |
4. Compare how the workspace mutates after each step instead of evaluating only the final response.
|
| 110 |
5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
|
| 111 |
|
src/executive_assistant/deployment.py
CHANGED
|
@@ -155,7 +155,7 @@ To make that possible under hackathon constraints, we replaced live services wit
|
|
| 155 |
- **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
|
| 156 |
- **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
|
| 157 |
- **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
|
| 158 |
-
- **App policies:** deterministic baseline and tabular RL checkpoint
|
| 159 |
- **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
|
| 160 |
- **UI layer:** Gradio control room plus visible workspace snapshots for judges
|
| 161 |
|
|
@@ -178,7 +178,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
|
|
| 178 |
- Live observation payloads
|
| 179 |
- Workspace tables for emails, todos, files, and action logs
|
| 180 |
- Step-by-step trace rows with reasoning, action type, status, score, and done state
|
| 181 |
-
- Differences between `baseline` and the bundled `rl` checkpoint policy
|
| 182 |
|
| 183 |
## Runtime And Deployment Notes
|
| 184 |
|
|
@@ -196,7 +196,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
|
|
| 196 |
|
| 197 |
1. Open the Space and choose one of the seeded scenarios.
|
| 198 |
2. Run the deterministic `baseline` policy for a guaranteed reference trace.
|
| 199 |
-
3. Switch to `rl`
|
| 200 |
4. Compare how the workspace mutates after each step instead of evaluating only the final response.
|
| 201 |
5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
|
| 202 |
|
|
|
|
| 155 |
- **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
|
| 156 |
- **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
|
| 157 |
- **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
|
| 158 |
+
- **App policies:** deterministic baseline and an RL mode where OpenRouter Gemma generates actions using the tabular RL checkpoint recommendation as guidance
|
| 159 |
- **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
|
| 160 |
- **UI layer:** Gradio control room plus visible workspace snapshots for judges
|
| 161 |
|
|
|
|
| 178 |
- Live observation payloads
|
| 179 |
- Workspace tables for emails, todos, files, and action logs
|
| 180 |
- Step-by-step trace rows with reasoning, action type, status, score, and done state
|
| 181 |
+
- Differences between `baseline` and the OpenRouter-guided bundled `rl` checkpoint policy
|
| 182 |
|
| 183 |
## Runtime And Deployment Notes
|
| 184 |
|
|
|
|
| 196 |
|
| 197 |
1. Open the Space and choose one of the seeded scenarios.
|
| 198 |
2. Run the deterministic `baseline` policy for a guaranteed reference trace.
|
| 199 |
+
3. Switch to `rl` so Gemma receives the learned checkpoint recommendation and generates the runtime action.
|
| 200 |
4. Compare how the workspace mutates after each step instead of evaluating only the final response.
|
| 201 |
5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
|
| 202 |
|
tests/test_app.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
from pathlib import Path
|
| 2 |
|
| 3 |
from src.executive_assistant.agent import BaselineAgent
|
|
|
|
| 4 |
from src.executive_assistant.training import train_q_learning
|
| 5 |
|
| 6 |
|
| 7 |
-
def test_app_builds_rl_policy_from_checkpoint(tmp_path) -> None:
|
|
|
|
|
|
|
| 8 |
from app import _build_policy
|
| 9 |
|
| 10 |
policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
|
|
@@ -13,10 +16,12 @@ def test_app_builds_rl_policy_from_checkpoint(tmp_path) -> None:
|
|
| 13 |
provider="rl",
|
| 14 |
checkpoint_path=str(checkpoint),
|
| 15 |
)
|
| 16 |
-
assert loaded_policy.epsilon == 0.0
|
| 17 |
|
| 18 |
|
| 19 |
-
def test_app_builds_missing_rl_checkpoint(tmp_path) -> None:
|
|
|
|
|
|
|
| 20 |
from app import _build_policy
|
| 21 |
|
| 22 |
checkpoint = tmp_path / "missing" / "q_policy.json"
|
|
@@ -24,11 +29,42 @@ def test_app_builds_missing_rl_checkpoint(tmp_path) -> None:
|
|
| 24 |
provider="rl",
|
| 25 |
checkpoint_path=str(checkpoint),
|
| 26 |
)
|
| 27 |
-
assert loaded_policy.epsilon == 0.0
|
| 28 |
assert checkpoint.exists()
|
| 29 |
|
| 30 |
|
| 31 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
from app import run_live_episode
|
| 33 |
|
| 34 |
policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
|
| 3 |
from src.executive_assistant.agent import BaselineAgent
|
| 4 |
+
from src.executive_assistant.models import AssistantAction, PolicyDecision
|
| 5 |
from src.executive_assistant.training import train_q_learning
|
| 6 |
|
| 7 |
|
| 8 |
+
def test_app_builds_rl_policy_from_checkpoint(tmp_path, monkeypatch) -> None:
|
| 9 |
+
monkeypatch.setenv("OPENROUTER_API_KEY", "")
|
| 10 |
+
monkeypatch.setenv("OPENAI_API_KEY", "")
|
| 11 |
from app import _build_policy
|
| 12 |
|
| 13 |
policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
|
|
|
|
| 16 |
provider="rl",
|
| 17 |
checkpoint_path=str(checkpoint),
|
| 18 |
)
|
| 19 |
+
assert loaded_policy.checkpoint_policy.epsilon == 0.0
|
| 20 |
|
| 21 |
|
| 22 |
+
def test_app_builds_missing_rl_checkpoint(tmp_path, monkeypatch) -> None:
|
| 23 |
+
monkeypatch.setenv("OPENROUTER_API_KEY", "")
|
| 24 |
+
monkeypatch.setenv("OPENAI_API_KEY", "")
|
| 25 |
from app import _build_policy
|
| 26 |
|
| 27 |
checkpoint = tmp_path / "missing" / "q_policy.json"
|
|
|
|
| 29 |
provider="rl",
|
| 30 |
checkpoint_path=str(checkpoint),
|
| 31 |
)
|
| 32 |
+
assert loaded_policy.checkpoint_policy.epsilon == 0.0
|
| 33 |
assert checkpoint.exists()
|
| 34 |
|
| 35 |
|
| 36 |
+
def test_rl_policy_uses_openrouter_model_with_checkpoint_guidance() -> None:
    """The guided policy returns the model's action and hands it the checkpoint hint."""
    from app import OpenRouterGuidedCheckpointPolicy
    from src.executive_assistant.env import ExecutiveAssistantEnv

    class RecordingModelPolicy:
        """Stand-in model policy that remembers the observation it received."""

        def __init__(self) -> None:
            self.seen = None

        def choose_action(self, task_name, observation):
            self.seen = observation
            fixed_action = AssistantAction(action_type="read_email", target_id=1)
            return PolicyDecision(
                reasoning="Followed the checkpoint hint.",
                action=fixed_action,
            )

    task = "easy_deadline_extraction"
    trained, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
    recorder = RecordingModelPolicy()
    guided = OpenRouterGuidedCheckpointPolicy(trained, recorder)
    environment = ExecutiveAssistantEnv(task_name=task)

    decision = guided.choose_action(task, environment.reset())

    # The returned action is the model's, and the reasoning says so.
    assert decision.action.action_type == "read_email"
    assert "OpenRouter Gemma generated" in decision.reasoning

    # The model must have been called with the checkpoint hint in its history.
    assert recorder.seen is not None
    hints = [
        entry
        for entry in recorder.seen.action_history
        if "Trained RL checkpoint recommendation" in entry
    ]
    assert hints
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def test_app_stepwise_episode_generator_yields_updates(tmp_path, monkeypatch) -> None:
|
| 66 |
+
monkeypatch.setenv("OPENROUTER_API_KEY", "")
|
| 67 |
+
monkeypatch.setenv("OPENAI_API_KEY", "")
|
| 68 |
from app import run_live_episode
|
| 69 |
|
| 70 |
policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
|