Flickinshots committed on
Commit
55f5aac
·
verified ·
1 Parent(s): 0ae427e

Deploy Project Epsilon Space bundle

Browse files
README.md CHANGED
@@ -65,7 +65,7 @@ To make that possible under hackathon constraints, we replaced live services wit
65
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
66
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
67
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
68
- - **App policies:** deterministic baseline and tabular RL checkpoint replay
69
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
70
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
71
 
@@ -88,7 +88,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
88
  - Live observation payloads
89
  - Workspace tables for emails, todos, files, and action logs
90
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
91
- - Differences between `baseline` and the bundled `rl` checkpoint policy
92
 
93
  ## Runtime And Deployment Notes
94
 
@@ -106,7 +106,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
106
 
107
  1. Open the Space and choose one of the seeded scenarios.
108
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
109
- 3. Switch to `rl` to replay the bundled learned checkpoint.
110
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
111
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
112
 
 
65
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
66
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
67
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
68
+ - **App policies:** deterministic baseline and an RL mode where OpenRouter Gemma generates actions using the tabular RL checkpoint recommendation as guidance
69
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
70
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
71
 
 
88
  - Live observation payloads
89
  - Workspace tables for emails, todos, files, and action logs
90
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
91
+ - Differences between `baseline` and the OpenRouter-guided bundled `rl` checkpoint policy
92
 
93
  ## Runtime And Deployment Notes
94
 
 
106
 
107
  1. Open the Space and choose one of the seeded scenarios.
108
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
109
+ 3. Switch to `rl` so Gemma receives the learned checkpoint recommendation and generates the runtime action.
110
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
111
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
112
 
app.py CHANGED
@@ -1,15 +1,17 @@
1
  from __future__ import annotations
2
 
3
  import json
 
4
  import time
5
  import uuid
6
  from html import escape
7
 
8
  import gradio as gr
9
 
10
- from src.executive_assistant.agent import BaselineAgent
11
- from src.executive_assistant.config import AppRuntimeConfig, load_env_file
12
  from src.executive_assistant.env import ExecutiveAssistantEnv
 
13
  from src.executive_assistant.runner import EpisodeRunner
14
  from src.executive_assistant.training import QLearningPolicy, default_checkpoint_path, train_q_learning
15
 
@@ -758,6 +760,79 @@ def _ensure_rl_checkpoint(checkpoint_path: str) -> str:
758
  return str(saved_path)
759
 
760
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
  def _build_policy(
762
  provider: str,
763
  checkpoint_path: str,
@@ -765,7 +840,13 @@ def _build_policy(
765
  if provider == "baseline":
766
  return BaselineAgent()
767
  if provider == "rl":
768
- return QLearningPolicy.load(_ensure_rl_checkpoint(checkpoint_path or _default_rl_checkpoint()))
 
 
 
 
 
 
769
  raise ValueError(f"Unsupported app policy provider: {provider}")
770
 
771
 
@@ -926,7 +1007,7 @@ with gr.Blocks(title="Autonomous Executive Assistant Sandbox") as demo:
926
  <h1>Executive Assistant Sandbox</h1>
927
  <p>
928
  Run the exact same episode loop used in training, inspect each workspace mutation in real time,
929
- and compare the deterministic baseline against the trained RL checkpoint without losing the structure of the task.
930
  </p>
931
  <div class="hero-strip">
932
  <div class="hero-pill">Shared EpisodeRunner path</div>
@@ -954,7 +1035,7 @@ with gr.Blocks(title="Autonomous Executive Assistant Sandbox") as demo:
954
  """
955
  <h2 class="panel-title">Control Room</h2>
956
  <p class="panel-copy">
957
- Pick a scenario, choose baseline or the trained RL JSON checkpoint, and run a stepwise episode against the same environment used by training and evaluation.
958
  </p>
959
  """
960
  )
@@ -992,7 +1073,7 @@ with gr.Blocks(title="Autonomous Executive Assistant Sandbox") as demo:
992
  gr.HTML(
993
  """
994
  <p class="footnote">
995
- The RL policy always replays a trained JSON checkpoint. OpenRouter is reserved for the separate validator-facing inference script, not the live app policy controls.
996
  </p>
997
  """
998
  )
 
1
  from __future__ import annotations
2
 
3
  import json
4
+ import os
5
  import time
6
  import uuid
7
  from html import escape
8
 
9
  import gradio as gr
10
 
11
+ from src.executive_assistant.agent import BaselineAgent, OpenRouterPolicy
12
+ from src.executive_assistant.config import AppRuntimeConfig, OpenRouterConfig, load_env_file
13
  from src.executive_assistant.env import ExecutiveAssistantEnv
14
+ from src.executive_assistant.models import PolicyDecision, WorkspaceObservation
15
  from src.executive_assistant.runner import EpisodeRunner
16
  from src.executive_assistant.training import QLearningPolicy, default_checkpoint_path, train_q_learning
17
 
 
760
  return str(saved_path)
761
 
762
 
763
class OpenRouterGuidedCheckpointPolicy:
    """Hybrid app policy: the tabular RL checkpoint proposes, OpenRouter Gemma disposes.

    Each step first queries the trained checkpoint for a recommendation, then
    forwards the observation — annotated with that recommendation — to the
    OpenRouter-backed model so it generates the runtime action. When no model
    is configured, or the model call raises, the checkpoint action is used
    verbatim with an explanatory reasoning prefix.
    """

    def __init__(
        self,
        checkpoint_policy: QLearningPolicy,
        model_policy: OpenRouterPolicy | None,
    ) -> None:
        # Checkpoint is mandatory; the model is optional and may be None.
        self.checkpoint_policy = checkpoint_policy
        self.model_policy = model_policy

    @staticmethod
    def _checkpoint_fallback(prefix: str, hint: PolicyDecision) -> PolicyDecision:
        # Replay the checkpoint action, prefixing why the model path was skipped.
        return PolicyDecision(
            reasoning=f"{prefix}{hint.reasoning}",
            action=hint.action,
        )

    def choose_action(self, task_name: str, observation: WorkspaceObservation) -> PolicyDecision:
        """Return the model-generated decision, guided by the checkpoint hint."""
        hint = self.checkpoint_policy.choose_action(task_name, observation)
        if self.model_policy is None:
            return self._checkpoint_fallback(
                "OpenRouter model is not configured; using the trained RL checkpoint action. ",
                hint,
            )
        # Surface the checkpoint recommendation to the model via action history.
        recommendation = (
            "Trained RL checkpoint recommendation: "
            f"reasoning={hint.reasoning}; "
            f"action={hint.action.model_dump()}"
        )
        guided = observation.model_copy(
            update={"action_history": observation.action_history + [recommendation]}
        )
        try:
            generated = self.model_policy.choose_action(task_name, guided)
        except Exception as exc:  # any model/network failure degrades to replay
            return self._checkpoint_fallback(
                f"OpenRouter model call failed ({exc}); using the trained RL checkpoint action. ",
                hint,
            )
        return PolicyDecision(
            reasoning=(
                "OpenRouter Gemma generated this action using the trained RL checkpoint recommendation. "
                f"Model reasoning: {generated.reasoning} | Checkpoint recommendation: "
                f"{hint.reasoning}"
            ),
            action=generated.action,
        )
812
+
813
+
814
def _build_openrouter_policy() -> "OpenRouterPolicy | None":
    """Build the OpenRouter-backed model policy from environment variables.

    Returns:
        An ``OpenRouterPolicy`` when an API key is configured, otherwise
        ``None`` so the caller falls back to pure checkpoint replay.
    """
    # Accept either OPENROUTER_API_KEY or OPENAI_API_KEY; whitespace-only
    # values count as unset so the app degrades gracefully.
    api_key = os.environ.get("OPENROUTER_API_KEY", "").strip() or os.environ.get(
        "OPENAI_API_KEY",
        "",
    ).strip()
    if not api_key:
        return None
    config = OpenRouterConfig(
        api_key=api_key,
        base_url=os.environ.get("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"),
        # Fixed default model id: "google/gemma-4-31b-it" does not exist on
        # OpenRouter (there is no Gemma 4 31B); the largest instruction-tuned
        # Gemma 3 is google/gemma-3-27b-it. The old default made every model
        # call fail and silently degrade to checkpoint replay.
        model_name=os.environ.get("OPENROUTER_MODEL", "google/gemma-3-27b-it"),
        site_url=os.environ.get("OPENROUTER_SITE_URL", "http://localhost:7860"),
        app_name=os.environ.get(
            "OPENROUTER_APP_NAME",
            "EmailMaestro | Executive Assistant Sandbox",
        ),
        temperature=float(os.environ.get("OPENROUTER_TEMPERATURE", "0.1")),
        max_tokens=int(os.environ.get("OPENROUTER_MAX_TOKENS", "600")),
    )
    return OpenRouterPolicy(config=config)
834
+
835
+
836
  def _build_policy(
837
  provider: str,
838
  checkpoint_path: str,
 
840
  if provider == "baseline":
841
  return BaselineAgent()
842
  if provider == "rl":
843
+ checkpoint_policy = QLearningPolicy.load(
844
+ _ensure_rl_checkpoint(checkpoint_path or _default_rl_checkpoint())
845
+ )
846
+ return OpenRouterGuidedCheckpointPolicy(
847
+ checkpoint_policy=checkpoint_policy,
848
+ model_policy=_build_openrouter_policy(),
849
+ )
850
  raise ValueError(f"Unsupported app policy provider: {provider}")
851
 
852
 
 
1007
  <h1>Executive Assistant Sandbox</h1>
1008
  <p>
1009
  Run the exact same episode loop used in training, inspect each workspace mutation in real time,
1010
+ and compare the deterministic baseline against the OpenRouter-guided RL checkpoint without losing the structure of the task.
1011
  </p>
1012
  <div class="hero-strip">
1013
  <div class="hero-pill">Shared EpisodeRunner path</div>
 
1035
  """
1036
  <h2 class="panel-title">Control Room</h2>
1037
  <p class="panel-copy">
1038
+ Pick a scenario, choose baseline or the OpenRouter-guided trained RL JSON checkpoint, and run a stepwise episode against the same environment used by training and evaluation.
1039
  </p>
1040
  """
1041
  )
 
1073
  gr.HTML(
1074
  """
1075
  <p class="footnote">
1076
+ The RL policy loads the trained JSON checkpoint as guidance, then asks OpenRouter Gemma through the OpenAI client to generate the runtime action. If the model call fails, it falls back to the checkpoint action.
1077
  </p>
1078
  """
1079
  )
docs/HF_SPACE_README.md CHANGED
@@ -65,7 +65,7 @@ To make that possible under hackathon constraints, we replaced live services wit
65
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
66
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
67
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
68
- - **App policies:** deterministic baseline and tabular RL checkpoint replay
69
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
70
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
71
 
@@ -88,7 +88,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
88
  - Live observation payloads
89
  - Workspace tables for emails, todos, files, and action logs
90
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
91
- - Differences between `baseline` and the bundled `rl` checkpoint policy
92
 
93
  ## Runtime And Deployment Notes
94
 
@@ -105,7 +105,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
105
 
106
  1. Open the Space and choose one of the seeded scenarios.
107
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
108
- 3. Switch to `rl` to replay the bundled learned checkpoint.
109
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
110
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
111
 
 
65
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
66
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
67
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
68
+ - **App policies:** deterministic baseline and an RL mode where OpenRouter Gemma generates actions using the tabular RL checkpoint recommendation as guidance
69
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
70
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
71
 
 
88
  - Live observation payloads
89
  - Workspace tables for emails, todos, files, and action logs
90
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
91
+ - Differences between `baseline` and the OpenRouter-guided bundled `rl` checkpoint policy
92
 
93
  ## Runtime And Deployment Notes
94
 
 
105
 
106
  1. Open the Space and choose one of the seeded scenarios.
107
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
108
+ 3. Switch to `rl` so Gemma receives the learned checkpoint recommendation and generates the runtime action.
109
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
110
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
111
 
src/executive_assistant/deployment.py CHANGED
@@ -155,7 +155,7 @@ To make that possible under hackathon constraints, we replaced live services wit
155
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
156
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
157
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
158
- - **App policies:** deterministic baseline and tabular RL checkpoint replay
159
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
160
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
161
 
@@ -178,7 +178,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
178
  - Live observation payloads
179
  - Workspace tables for emails, todos, files, and action logs
180
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
181
- - Differences between `baseline` and the bundled `rl` checkpoint policy
182
 
183
  ## Runtime And Deployment Notes
184
 
@@ -196,7 +196,7 @@ The environment includes a stakeholder email asking for exact metrics from a loc
196
 
197
  1. Open the Space and choose one of the seeded scenarios.
198
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
199
- 3. Switch to `rl` to replay the bundled learned checkpoint.
200
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
201
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
202
 
 
155
  - **Environment state:** in-memory SQLite workspace simulating emails, todos, files, and action history
156
  - **OpenEnv contract:** Pydantic models defining observations, actions, rewards, and policy decisions
157
  - **Execution loop:** shared `EpisodeRunner` used by tests, scripts, notebook experiments, and the Gradio app
158
+ - **App policies:** deterministic baseline and an RL mode where OpenRouter Gemma generates actions using the tabular RL checkpoint recommendation as guidance
159
  - **Validator inference:** OpenRouter-backed Gemma execution through the OpenAI client compatibility layer
160
  - **UI layer:** Gradio control room plus visible workspace snapshots for judges
161
 
 
178
  - Live observation payloads
179
  - Workspace tables for emails, todos, files, and action logs
180
  - Step-by-step trace rows with reasoning, action type, status, score, and done state
181
+ - Differences between `baseline` and the OpenRouter-guided bundled `rl` checkpoint policy
182
 
183
  ## Runtime And Deployment Notes
184
 
 
196
 
197
  1. Open the Space and choose one of the seeded scenarios.
198
  2. Run the deterministic `baseline` policy for a guaranteed reference trace.
199
+ 3. Switch to `rl` so Gemma receives the learned checkpoint recommendation and generates the runtime action.
200
  4. Compare how the workspace mutates after each step instead of evaluating only the final response.
201
  5. Use the root `inference.py` path for OpenRouter-backed Gemma evaluation when the validator runs model inference.
202
 
tests/test_app.py CHANGED
@@ -1,10 +1,13 @@
1
  from pathlib import Path
2
 
3
  from src.executive_assistant.agent import BaselineAgent
 
4
  from src.executive_assistant.training import train_q_learning
5
 
6
 
7
- def test_app_builds_rl_policy_from_checkpoint(tmp_path) -> None:
 
 
8
  from app import _build_policy
9
 
10
  policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
@@ -13,10 +16,12 @@ def test_app_builds_rl_policy_from_checkpoint(tmp_path) -> None:
13
  provider="rl",
14
  checkpoint_path=str(checkpoint),
15
  )
16
- assert loaded_policy.epsilon == 0.0
17
 
18
 
19
- def test_app_builds_missing_rl_checkpoint(tmp_path) -> None:
 
 
20
  from app import _build_policy
21
 
22
  checkpoint = tmp_path / "missing" / "q_policy.json"
@@ -24,11 +29,42 @@ def test_app_builds_missing_rl_checkpoint(tmp_path) -> None:
24
  provider="rl",
25
  checkpoint_path=str(checkpoint),
26
  )
27
- assert loaded_policy.epsilon == 0.0
28
  assert checkpoint.exists()
29
 
30
 
31
- def test_app_stepwise_episode_generator_yields_updates(tmp_path) -> None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  from app import run_live_episode
33
 
34
  policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
 
1
  from pathlib import Path
2
 
3
  from src.executive_assistant.agent import BaselineAgent
4
+ from src.executive_assistant.models import AssistantAction, PolicyDecision
5
  from src.executive_assistant.training import train_q_learning
6
 
7
 
8
+ def test_app_builds_rl_policy_from_checkpoint(tmp_path, monkeypatch) -> None:
9
+ monkeypatch.setenv("OPENROUTER_API_KEY", "")
10
+ monkeypatch.setenv("OPENAI_API_KEY", "")
11
  from app import _build_policy
12
 
13
  policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
 
16
  provider="rl",
17
  checkpoint_path=str(checkpoint),
18
  )
19
+ assert loaded_policy.checkpoint_policy.epsilon == 0.0
20
 
21
 
22
+ def test_app_builds_missing_rl_checkpoint(tmp_path, monkeypatch) -> None:
23
+ monkeypatch.setenv("OPENROUTER_API_KEY", "")
24
+ monkeypatch.setenv("OPENAI_API_KEY", "")
25
  from app import _build_policy
26
 
27
  checkpoint = tmp_path / "missing" / "q_policy.json"
 
29
  provider="rl",
30
  checkpoint_path=str(checkpoint),
31
  )
32
+ assert loaded_policy.checkpoint_policy.epsilon == 0.0
33
  assert checkpoint.exists()
34
 
35
 
36
+ def test_rl_policy_uses_openrouter_model_with_checkpoint_guidance() -> None:
37
+ from app import OpenRouterGuidedCheckpointPolicy
38
+ from src.executive_assistant.env import ExecutiveAssistantEnv
39
+
40
+ class StubModelPolicy:
41
+ def __init__(self) -> None:
42
+ self.observation = None
43
+
44
+ def choose_action(self, task_name, observation):
45
+ self.observation = observation
46
+ return PolicyDecision(
47
+ reasoning="Followed the checkpoint hint.",
48
+ action=AssistantAction(action_type="read_email", target_id=1),
49
+ )
50
+
51
+ q_policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())
52
+ model_policy = StubModelPolicy()
53
+ policy = OpenRouterGuidedCheckpointPolicy(q_policy, model_policy)
54
+ env = ExecutiveAssistantEnv(task_name="easy_deadline_extraction")
55
+ decision = policy.choose_action("easy_deadline_extraction", env.reset())
56
+ assert decision.action.action_type == "read_email"
57
+ assert "OpenRouter Gemma generated" in decision.reasoning
58
+ assert model_policy.observation is not None
59
+ assert any(
60
+ "Trained RL checkpoint recommendation" in entry
61
+ for entry in model_policy.observation.action_history
62
+ )
63
+
64
+
65
+ def test_app_stepwise_episode_generator_yields_updates(tmp_path, monkeypatch) -> None:
66
+ monkeypatch.setenv("OPENROUTER_API_KEY", "")
67
+ monkeypatch.setenv("OPENAI_API_KEY", "")
68
  from app import run_live_episode
69
 
70
  policy, _ = train_q_learning(episodes=12, epsilon=0.1, teacher=BaselineAgent())