Flickinshots committed on
Commit
55f5aac
·
verified ·
1 Parent(s): 0ae427e

Deploy Project Epsilon Space bundle

Browse files
README.md CHANGED
@@ -65,7 +65,7 @@ To make that possible under hackathon constraints, we replaced live services wit
65
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
66
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
67
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
68
- - **App policies:** deterministic baseline and tabular RL checkpoint replay
69
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
70
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
71
 
@@ -88,7 +88,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
88
  - Live observation payloads
89
  - Workspace tables for emails, todos, files, and action logs
90
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
91
- - Differences between `baseline` and the bundled `rl` checkpoint policy
92
 
93
  ## Runtime And Deployment Notes
94
 
@@ -106,7 +106,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
106
 
107
  1. Open the Space and choose one of the seeded scenarios.
108
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
109
- 3. Switch to `rl` to replay the bundled learned checkpoint.
110
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
111
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
112
 
 
65
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
66
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
67
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
68
+ - **App policies:** deterministic baseline and an RL mode where OpenRouter Gemma generates actions using the tabular RL checkpoint recommendation as guidance
69
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
70
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
71
 
 
88
  - Live observation payloads
89
  - Workspace tables for emails, todos, files, and action logs
90
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
91
+ - Differences between `baseline` and the OpenRouter-guided bundled `rl` checkpoint policy
92
 
93
  ## Runtime And Deployment Notes
94
 
 
106
 
107
  1. Open the Space and choose one of the seeded scenarios.
108
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
109
+ 3. Switch to `rl` so Gemma receives the learned checkpoint recommendation and generates the runtime action.
110
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
111
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
112
 
app.py CHANGED
@@ -1,15 +1,17 @@
1
  from __future__ import annotations
2
 
3
  import json
 
4
  import time
5
  import uuid
6
  from html import escape
7
 
8
  import gradio as gr
9
 
10
- from src.executive_assistant.agent import BaselineAgent
11
- from src.executive_assistant.config import AppRuntimeConfig, load_env_file
12
  from src.executive_assistant.env import ExecutiveAssistantEnv
 
13
  from src.executive_assistant.runner import EpisodeRunner
14
  from src.executive_assistant.training import QLearningPolicy, default_checkpoint_path, train_q_learning
15
 
@@ -758,6 +760,79 @@ def _ensure_rl_checkpoint(checkpoint_path: str) -> str:
758
  return str(saved_path)
759
 
760
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
  def _build_policy(
762
  provider: str,
763
  checkpoint_path: str,
@@ -765,7 +840,13 @@ def _build_policy(
765
  if provider == "baseline":
766
  return BaselineAgent()
767
  if provider == "rl":
768
- return QLearningPolicy.load(_ensure_rl_checkpoint(checkpoint_path or _default_rl_checkpoint()))
 
 
 
 
 
 
769
  raise ValueError(f"Unsupported app policy provider: {provider}")
770
 
771
 
@@ -926,7 +1007,7 @@ with gr.Blocks(title="Autonomous Executive Assistant Sandbox") as demo:
926
  <h1>Executive Assistant Sandbox</h1>
927
  <p>
928
  Run the exact same episode loop used in training, inspect each workspace mutation in real time,
929
- and compare the deterministic baseline against the trained RL checkpoint without losing the structure of the task.
930
  </p>
931
  <div class="hero-strip">
932
  <div class="hero-pill">Shared EpisodeRunner path</div>
@@ -954,7 +1035,7 @@ with gr.Blocks(title="Autonomous Executive Assistant Sandbox") as demo:
954
  """
955
  <h2 class="panel-title">Control Room</h2>
956
  <p class="panel-copy">
957
- Pick a scenario, choose baseline or the trained RL JSON checkpoint, and run a stepwise episode against the same environment used by training and evaluation.
958
  </p>
959
  """
960
  )
@@ -992,7 +1073,7 @@ with gr.Blocks(title="Autonomous Executive Assistant Sandbox") as demo:
992
  gr.HTML(
993
  """
994
  <p class="footnote">
995
- The RL policy always replays a trained JSON checkpoint. OpenRouter is reserved for the separate validator-facing inference script, not the live app policy controls.
996
  </p>
997
  """
998
  )
 
1
  from __future__ import annotations
2
 
3
  import json
4
+ import os
5
  import time
6
  import uuid
7
  from html import escape
8
 
9
  import gradio as gr
10
 
11
+ from src.executive_assistant.agent import BaselineAgent, OpenRouterPolicy
12
+ from src.executive_assistant.config import AppRuntimeConfig, OpenRouterConfig, load_env_file
13
  from src.executive_assistant.env import ExecutiveAssistantEnv
14
+ from src.executive_assistant.models import PolicyDecision, WorkspaceObservation
15
  from src.executive_assistant.runner import EpisodeRunner
16
  from src.executive_assistant.training import QLearningPolicy, default_checkpoint_path, train_q_learning
17
 
 
760
  return str(saved_path)
761
 
762
 
763
class OpenRouterGuidedCheckpointPolicy:
    """Hybrid app policy: the tabular RL checkpoint proposes, OpenRouter Gemma disposes.

    Each step first queries the trained checkpoint for a recommendation, then
    forwards the observation — annotated with that recommendation — to the
    OpenRouter-backed model so it generates the runtime action. When no model
    is configured, or the model call raises, the checkpoint action is used
    verbatim with an explanatory reasoning prefix.
    """

    def __init__(
        self,
        checkpoint_policy: QLearningPolicy,
        model_policy: OpenRouterPolicy | None,
    ) -> None:
        # Checkpoint is mandatory; the model is optional and may be None.
        self.checkpoint_policy = checkpoint_policy
        self.model_policy = model_policy

    @staticmethod
    def _checkpoint_fallback(prefix: str, hint: PolicyDecision) -> PolicyDecision:
        # Replay the checkpoint action, prefixing why the model path was skipped.
        return PolicyDecision(
            reasoning=f"{prefix}{hint.reasoning}",
            action=hint.action,
        )

    def choose_action(self, task_name: str, observation: WorkspaceObservation) -> PolicyDecision:
        """Return the model-generated decision, guided by the checkpoint hint."""
        hint = self.checkpoint_policy.choose_action(task_name, observation)
        if self.model_policy is None:
            return self._checkpoint_fallback(
                "OpenRouter model is not configured; using the trained RL checkpoint action. ",
                hint,
            )
        # Surface the checkpoint recommendation to the model via action history.
        recommendation = (
            "Trained RL checkpoint recommendation: "
            f"reasoning={hint.reasoning}; "
            f"action={hint.action.model_dump()}"
        )
        guided = observation.model_copy(
            update={"action_history": observation.action_history + [recommendation]}
        )
        try:
            generated = self.model_policy.choose_action(task_name, guided)
        except Exception as exc:  # any model/network failure degrades to replay
            return self._checkpoint_fallback(
                f"OpenRouter model call failed ({exc}); using the trained RL checkpoint action. ",
                hint,
            )
        return PolicyDecision(
            reasoning=(
                "OpenRouter Gemma generated this action using the trained RL checkpoint recommendation. "
                f"Model reasoning: {generated.reasoning} | Checkpoint recommendation: "
                f"{hint.reasoning}"
            ),
            action=generated.action,
        )
812
+
813
+
814
def _build_openrouter_policy() -> "OpenRouterPolicy | None":
    """Build the OpenRouter-backed model policy from environment variables.

    Returns:
        An ``OpenRouterPolicy`` when an API key is configured, otherwise
        ``None`` so the caller falls back to pure checkpoint replay.
    """
    # Accept either OPENROUTER_API_KEY or OPENAI_API_KEY; whitespace-only
    # values count as unset so the app degrades gracefully.
    api_key = os.environ.get("OPENROUTER_API_KEY", "").strip() or os.environ.get(
        "OPENAI_API_KEY",
        "",
    ).strip()
    if not api_key:
        return None
    config = OpenRouterConfig(
        api_key=api_key,
        base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
        # Fixed default model id: "google/gemma-4-31b-it" does not exist on
        # OpenRouter (there is no Gemma 4 31B); the largest instruction-tuned
        # Gemma 3 is google/gemma-3-27b-it. The old default made every model
        # call fail and silently degrade to checkpoint replay.
        model_name=os.environ.get("OPENROUTER_MODEL", "google/gemma-3-27b-it"),
        site_url=os.environ.get("OPENROUTER_SITE_URL", "http://localhost:7860"),
        app_name=os.environ.get(
            "OPENROUTER_APP_NAME",
            "EmailMaestro | Executive Assistant Sandbox",
        ),
        temperature=float(os.environ.get("OPENROUTER_TEMPERATURE", "0.1")),
        max_tokens=int(os.environ.get("OPENROUTER_MAX_TOKENS", "600")),
    )
    return OpenRouterPolicy(config=config)
834
+
835
+
836
  def _build_policy(
837
  provider: str,
838
  checkpoint_path: str,
 
840
  if provider == "baseline":
841
  return BaselineAgent()
842
  if provider == "rl":
843
+ checkpoint_policy = QLearningPolicy.load(
844
+ _ensure_rl_checkpoint(checkpoint_path or _default_rl_checkpoint())
845
+ )
846
+ return OpenRouterGuidedCheckpointPolicy(
847
+ checkpoint_policy=checkpoint_policy,
848
+ model_policy=_build_openrouter_policy(),
849
+ )
850
  raise ValueError(f"Unsupported app policy provider: {provider}")
851
 
852
 
 
1007
  <h1>Executive Assistant Sandbox</h1>
1008
  <p>
1009
  Run the exact same episode loop used in training, inspect each workspace mutation in real time,
1010
+ and compare the deterministic baseline against the OpenRouter-guided RL checkpoint without losing the structure of the task.
1011
  </p>
1012
  <div class="hero-strip">
1013
  <div class="hero-pill">Shared EpisodeRunner path</div>
 
1035
  """
1036
  <h2 class="panel-title">Control Room</h2>
1037
  <p class="panel-copy">
1038
+ Pick a scenario, choose baseline or the OpenRouter-guided trained RL JSON checkpoint, and run a stepwise episode against the same environment used by training and evaluation.
1039
  </p>
1040
  """
1041
  )
 
1073
  gr.HTML(
1074
  """
1075
  <p class="footnote">
1076
+ The RL policy loads the trained JSON checkpoint as guidance, then asks OpenRouter Gemma through the OpenAI client to generate the runtime action. If the model call fails, it falls back to the checkpoint action.
1077
  </p>
1078
  """
1079
  )
docs/HF_SPACE_README.md CHANGED
@@ -65,7 +65,7 @@ To make that possible under hackathon constraints, we replaced live services wit
65
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
66
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
67
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
68
- - **App policies:** deterministic baseline and tabular RL checkpoint replay
69
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
70
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
71
 
@@ -88,7 +88,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
88
  - Live observation payloads
89
  - Workspace tables for emails, todos, files, and action logs
90
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
91
- - Differences between `baseline` and the bundled `rl` checkpoint policy
92
 
93
  ## Runtime And Deployment Notes
94
 
@@ -105,7 +105,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
105
 
106
  1. Open the Space and choose one of the seeded scenarios.
107
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
108
- 3. Switch to `rl` to replay the bundled learned checkpoint.
109
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
110
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
111
 
 
65
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
66
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
67
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
68
+ - **App policies:** deterministic baseline and an RL mode where OpenRouter Gemma generates actions using the tabular RL checkpoint recommendation as guidance
69
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
70
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
71
 
 
88
  - Live observation payloads
89
  - Workspace tables for emails, todos, files, and action logs
90
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
91
+ - Differences between `baseline` and the OpenRouter-guided bundled `rl` checkpoint policy
92
 
93
  ## Runtime And Deployment Notes
94
 
 
105
 
106
  1. Open the Space and choose one of the seeded scenarios.
107
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
108
+ 3. Switch to `rl` so Gemma receives the learned checkpoint recommendation and generates the runtime action.
109
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
110
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
111
 
src/executive_assistant/deployment.py CHANGED
@@ -155,7 +155,7 @@ To make that possible under hackathon constraints, we replaced live services wit
155
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
156
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
157
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
158
- - **App policies:** deterministic baseline and tabular RL checkpoint replay
159
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
160
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
161
 
@@ -178,7 +178,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
178
  - Live observation payloads
179
  - Workspace tables for emails, todos, files, and action logs
180
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
181
- - Differences between `baseline` and the bundled `rl` checkpoint policy
182
 
183
  ## Runtime And Deployment Notes
184
 
@@ -196,7 +196,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
196
 
197
  1. Open the Space and choose one of the seeded scenarios.
198
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
199
- 3. Switch to `rl` to replay the bundled learned checkpoint.
200
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
201
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
202
 
 
155
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
156
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
157
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
158
+ - **App policies:** deterministic baseline and an RL mode where OpenRouter Gemma generates actions using the tabular RL checkpoint recommendation as guidance
159
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
160
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
161
 
 
178
  - Live observation payloads
179
  - Workspace tables for emails, todos, files, and action logs
180
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
181
+ - Differences between `baseline` and the OpenRouter-guided bundled `rl` checkpoint policy
182
 
183
  ## Runtime And Deployment Notes
184
 
 
196
 
197
  1. Open the Space and choose one of the seeded scenarios.
198
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
199
+ 3. Switch to `rl` so Gemma receives the learned checkpoint recommendation and generates the runtime action.
200
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
201
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
202
 
tests/test_app.py CHANGED
@@ -1,10 +1,13 @@
1
  from pathlib import Path
2
 
3
  from src.executive_assistant.agent import BaselineAgent
 
4
  from src.executive_assistant.training import train_q_learning
5
 
6
 
7
- def test_app_builds_rl_policy_from_checkpoint(tmp_path) -> None:
 
 
8
  from app import _build_policy
9
 
10
  policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
@@ -13,10 +16,12 @@ def test_app_builds_rl_policy_from_checkpoint(tmp_path) -> None:
13
  provider="rl",
14
  checkpoint_path=str(checkpoint),
15
  )
16
- assert loaded_policy.epsilon == 0.0
17
 
18
 
19
- def test_app_builds_missing_rl_checkpoint(tmp_path) -> None:
 
 
20
  from app import _build_policy
21
 
22
  checkpoint = tmp_path / "missing" / "q_policy.json"
@@ -24,11 +29,42 @@ def test_app_builds_missing_rl_checkpoint(tmp_path) -> None:
24
  provider="rl",
25
  checkpoint_path=str(checkpoint),
26
  )
27
- assert loaded_policy.epsilon == 0.0
28
  assert checkpoint.exists()
29
 
30
 
31
- def test_app_stepwise_episode_generator_yields_updates(tmp_path) -> None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  from app import run_live_episode
33
 
34
  policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
 
1
  from pathlib import Path
2
 
3
  from src.executive_assistant.agent import BaselineAgent
4
+ from src.executive_assistant.models import AssistantAction, PolicyDecision
5
  from src.executive_assistant.training import train_q_learning
6
 
7
 
8
+ def test_app_builds_rl_policy_from_checkpoint(tmp_path, monkeypatch) -> None:
9
+ monkeypatch.setenv("OPENROUTER_API_KEY", "")
10
+ monkeypatch.setenv("OPENAI_API_KEY", "")
11
  from app import _build_policy
12
 
13
  policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
 
16
  provider="rl",
17
  checkpoint_path=str(checkpoint),
18
  )
19
+ assert loaded_policy.checkpoint_policy.epsilon == 0.0
20
 
21
 
22
+ def test_app_builds_missing_rl_checkpoint(tmp_path, monkeypatch) -> None:
23
+ monkeypatch.setenv("OPENROUTER_API_KEY", "")
24
+ monkeypatch.setenv("OPENAI_API_KEY", "")
25
  from app import _build_policy
26
 
27
  checkpoint = tmp_path / "missing" / "q_policy.json"
 
29
  provider="rl",
30
  checkpoint_path=str(checkpoint),
31
  )
32
+ assert loaded_policy.checkpoint_policy.epsilon == 0.0
33
  assert checkpoint.exists()
34
 
35
 
36
+ def test_rl_policy_uses_openrouter_model_with_checkpoint_guidance() -> None:
37
+ from app import OpenRouterGuidedCheckpointPolicy
38
+ from src.executive_assistant.env import ExecutiveAssistantEnv
39
+
40
+ class StubModelPolicy:
41
+ def __init__(self) -> None:
42
+ self.observation = None
43
+
44
+ def choose_action(self, task_name, observation):
45
+ self.observation = observation
46
+ return PolicyDecision(
47
+ reasoning="Followed the checkpoint hint.",
48
+ action=AssistantAction(action_type="read_email", target_id=1),
49
+ )
50
+
51
+ q_policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
52
+ model_policy = StubModelPolicy()
53
+ policy = OpenRouterGuidedCheckpointPolicy(q_policy, model_policy)
54
+ env = ExecutiveAssistantEnv(task_name="easy_deadline_extraction")
55
+ decision = policy.choose_action("easy_deadline_extraction", env.reset())
56
+ assert decision.action.action_type == "read_email"
57
+ assert "OpenRouter Gemma generated" in decision.reasoning
58
+ assert model_policy.observation is not None
59
+ assert any(
60
+ "Trained RL checkpoint recommendation" in entry
61
+ for entry in model_policy.observation.action_history
62
+ )
63
+
64
+
65
+ def test_app_stepwise_episode_generator_yields_updates(tmp_path, monkeypatch) -> None:
66
+ monkeypatch.setenv("OPENROUTER_API_KEY", "")
67
+ monkeypatch.setenv("OPENAI_API_KEY", "")
68
  from app import run_live_episode
69
 
70
  policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())