# OpenEnv / server / app.py
# Uploaded by mahammadaftab -- commit 0678542 ("app.py Updated")
"""
OpenEnv Email Triage - Hugging Face Spaces Demo with FastAPI Router
Interactive web interface for testing the Email Triage environment.
Includes a POST /reset endpoint to satisfy automated validation checks.
"""
import gradio as gr
import json
import time
import numpy as np
from pathlib import Path
from fastapi import FastAPI
import uvicorn
from openenv.core.env import OpenEnv
from openenv.core.config import EnvConfig
from openenv.core.grader import create_grader
from openenv.core.models import Action
# FastAPI application that serves the plain REST endpoints (e.g. POST /reset)
# used by the hackathon's automated validation pings. The Gradio UI is mounted
# onto this same app further down in the module.
app = FastAPI()
# Location of the optional YAML task configuration. The path is relative, so
# it is resolved against the process's current working directory.
CONFIG_PATH = Path("openenv.yaml")
def load_yaml_config():
    """Load the optional YAML task configuration from ``CONFIG_PATH``.

    This is a best-effort loader: callers treat ``None`` as "no usable
    configuration file" and fall back to built-in defaults.

    Returns:
        The parsed YAML document (whatever ``yaml.safe_load`` produces), or
        ``None`` when PyYAML is not installed, the file cannot be opened or
        decoded, or its contents are not valid YAML.
    """
    # PyYAML is imported lazily so the demo keeps working without it.
    try:
        import yaml
    except ImportError:
        return None

    # Only swallow the failures that mean "no usable config file"; anything
    # else (including KeyboardInterrupt) is allowed to propagate.
    try:
        with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        return None
def get_task_config(task_level: str) -> dict:
    """Return the environment and grader settings for a difficulty level.

    The YAML configuration (when available) takes precedence. If it is
    missing, malformed, or does not define ``task_level``, the built-in
    defaults are used instead.

    Args:
        task_level: Difficulty key, e.g. ``'easy'``, ``'medium'`` or
            ``'hard'``.

    Returns:
        The task entry from the YAML file, or a built-in default dict with a
        ``'config'`` mapping (environment parameters) and a ``'grader'``
        mapping (success threshold and weighted criteria). Unknown levels
        fall back to the built-in ``'medium'`` entry.
    """
    defaults = {
        'easy': {
            'config': {'num_emails': 10, 'spam_ratio': 0.3, 'urgent_ratio': 0.2, 'confounding_ratio': 0.0},
            'grader': {
                'success_threshold': 0.7,
                'criteria': [{'name': 'accuracy', 'weight': 0.8}, {'name': 'critical_safety', 'weight': 0.2}],
            },
        },
        'medium': {
            'config': {'num_emails': 20, 'spam_ratio': 0.3, 'urgent_ratio': 0.2, 'confounding_ratio': 0.2},
            'grader': {
                'success_threshold': 0.8,
                'criteria': [{'name': 'accuracy', 'weight': 0.7}, {'name': 'critical_safety', 'weight': 0.3}],
            },
        },
        'hard': {
            'config': {'num_emails': 50, 'spam_ratio': 0.4, 'urgent_ratio': 0.1, 'confounding_ratio': 0.4},
            'grader': {
                'success_threshold': 0.9,
                'criteria': [{'name': 'accuracy', 'weight': 0.6}, {'name': 'critical_safety', 'weight': 0.4}],
            },
        },
    }

    yaml_config = load_yaml_config()
    # Guard every level of the lookup: the YAML document may be empty, may
    # not be a mapping, or may simply not list the requested level. In all of
    # those cases use the defaults rather than raising KeyError/TypeError.
    if isinstance(yaml_config, dict):
        tasks = yaml_config.get('tasks')
        if isinstance(tasks, dict) and task_level in tasks:
            return tasks[task_level]

    return defaults.get(task_level, defaults['medium'])
# Global environment state for the session. Both slots start out as None.
# NOTE(review): nothing in the visible part of this file reads these globals
# or assigns them via `global` -- the episode runners build their own local
# environment/grader objects. Confirm whether they are still needed.
env_instance = None
grader_instance = None
@app.post("/reset")
def rest_api_reset():
    """Answer ``POST /reset`` with a fixed success payload."""
    # The handler only acknowledges the request; it touches no state.
    response = {"status": "success"}
    return response
def run_demo_episode(task_level: str = "medium", seed: int = 42):
    """
    Run a single demo episode with a random agent and return its results.

    Args:
        task_level: Difficulty key used to look up the task configuration
            and passed to the environment and grader.
        seed: Random seed forwarded to ``env.reset``. It is coerced to
            ``int`` in case the UI slider delivers a float.

    Returns:
        A 3-tuple ``(history, metrics_text, grade_text)``:

        * ``history`` -- one row per step:
          ``[sender, subject, action label, reward, "Yes"/"No"]``.
        * ``metrics_text`` -- Markdown summary of the episode statistics.
        * ``grade_text`` -- Markdown grade report.

        If the environment cannot be configured or created, the tuple is
        ``([], "Error initializing environment", <message + traceback>)`` so
        the first element is still valid tabular data for the UI.
    """
    render_mode = "rgb_array"
    action_map = {0: "Ignore", 1: "Reply", 2: "Forward", 3: "Archive", 4: "Delete"}
    max_steps = 200  # Limit for demo

    # Get configuration
    task_config = get_task_config(task_level)

    # Create environment. Building the EnvConfig is part of initialization,
    # so a bad configuration is reported the same way as a failing
    # constructor.
    try:
        env_config = EnvConfig(
            **task_config['config'],
            task_level=task_level,
            render_mode=render_mode,
            verbose=False,
        )
        env = OpenEnv(config=env_config)
    except Exception as e:
        import traceback
        error_msg = f"Failed to create environment: {str(e)}\n\n{traceback.format_exc()}"
        print(error_msg)
        # The first output is a table, so return an empty table rather than
        # an image-shaped placeholder array.
        return [], "Error initializing environment", error_msg

    history = []
    total_reward = 0.0
    steps = 0

    # From here on the environment exists; always close it, even if the
    # grader or a step raises.
    try:
        grader = create_grader(task_level, task_config['grader'])

        # Reset
        _, info = env.reset(seed=int(seed))
        grader.reset()

        # Run episode
        for _ in range(max_steps):
            current_idx = env.current_email_index
            if current_idx >= len(env.emails_queue):
                break

            # Capture the email being acted on before stepping.
            email = env.emails_queue[current_idx]
            sender = email.sender
            subject = email.subject

            # Random action for demo (in real use, this would be your agent)
            action = env.action_space.sample()
            _, reward, terminated, truncated, info = env.step(action)

            history.append([
                sender,
                subject,
                action_map.get(action, str(action)),
                f"{reward:.1f}",
                "Yes" if info.get('last_reward', -1) > 0 else "No",
            ])

            grader.update(**info)
            total_reward += reward
            steps += 1

            if terminated or truncated:
                break

        grade_report = grader.get_grade_report()
    finally:
        env.close()

    # `info` is the dict from the last step (or from reset if no step ran).
    metrics_text = "\n".join([
        "**Episode Statistics:**",
        f"- Steps: {steps}",
        f"- Total Reward: {total_reward:.2f}",
        f"- Correct Actions: {info.get('correct_actions', 0)}",
        f"- Incorrect Actions: {info.get('incorrect_actions', 0)}",
        f"- Critical Failures: {info.get('critical_failures', 0)}",
    ])

    # The empty strings reproduce the leading/trailing newlines and the blank
    # lines of the report layout.
    status = '✓ PASSED' if grade_report['passed'] else '✗ FAILED'
    grade_lines = [
        "",
        f"**Performance Grade: {grade_report['final_score']:.2f} / 1.00**",
        f"{grade_report['feedback']}",
        "**Criteria Scores:**",
        "",
    ]
    grade_lines.extend(
        f"- {criterion_name.replace('_', ' ').title()}: {score:.2f}"
        for criterion_name, score in grade_report['criteria_scores'].items()
    )
    grade_lines.extend([
        "",
        f"**Status:** {status}",
        f"Threshold: {grade_report['success_threshold']:.2f}",
    ])
    grade_text = "\n".join(grade_lines)

    return history, metrics_text, grade_text
def compare_all_levels(seed: int = 42):
    """
    Run one random-agent episode per difficulty level and tabulate the grades.

    Args:
        seed: Random seed forwarded to every ``env.reset`` call. It is
            coerced to ``int`` in case the UI slider delivers a float.

    Returns:
        A Markdown table (as a string) with one row per level showing the
        final score, pass/fail mark and number of steps taken.
    """
    max_steps = 300  # Safety cap per episode
    results = []

    for level in ['easy', 'medium', 'hard']:
        task_config = get_task_config(level)
        env_config = EnvConfig(
            **task_config['config'],
            task_level=level,
            verbose=False,
        )
        env = OpenEnv(config=env_config)

        # Always release the environment, even if the grader or a step fails.
        try:
            grader = create_grader(level, task_config['grader'])
            env.reset(seed=int(seed))
            grader.reset()

            # Run episode
            done = False
            steps = 0
            while not done and steps < max_steps:
                action = env.action_space.sample()
                _, _, terminated, truncated, info = env.step(action)
                grader.update(**info)
                done = terminated or truncated
                steps += 1

            grade_report = grader.get_grade_report()
        finally:
            env.close()

        results.append({
            'level': level.upper(),
            'score': grade_report['final_score'],
            'passed': '✓' if grade_report['passed'] else '✗',
            'steps': steps,
        })

    # Create comparison table
    table_lines = [
        "| Difficulty | Score | Status | Steps |\n",
        "|------------|-------|--------|-------|\n",
    ]
    table_lines.extend(
        f"| {result['level']:10s} | {result['score']:.2f} | {result['passed']:6s} | {result['steps']:5d} |\n"
        for result in results
    )
    return "".join(table_lines)
def create_demo():
    """Build the Gradio Blocks UI for the Email Triage demo.

    The layout has a control column (difficulty, seed, buttons), a table of
    the triage history, a metrics/grade row and a level-comparison box.
    "Run Episode" and the initial page load both call ``run_demo_episode``;
    "Compare All Levels" calls ``compare_all_levels``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    # NOTE(review): the container nesting below was reconstructed from source
    # whose indentation was lost -- confirm against the intended layout.
    with gr.Blocks(title="OpenEnv Email Triage") as demo:
        gr.Markdown("# 📧 OpenEnv: Email Triage")
        gr.Markdown("Real-world task environment for AI agent training. Classify the inbox accurately and maintain safety limits.")

        with gr.Row():
            with gr.Column(scale=1):
                task_level_dropdown = gr.Dropdown(choices=['easy', 'medium', 'hard'], value='medium', label="Difficulty")
                seed_slider = gr.Slider(minimum=0, maximum=1000, value=42, step=1, label="Random Seed")
                # NOTE(review): no click handler is attached to this button
                # anywhere in this function.
                reset_btn = gr.Button("Initialize Inbox", variant="primary")
                run_button = gr.Button("🚀 Run Episode", variant="primary")
                compare_button = gr.Button("📊 Compare All Levels")
            with gr.Column(scale=3):
                gr.Markdown("### 📺 Environment View")
                output_view = gr.Dataframe(
                    label="Inbox Triage History",
                    headers=["Sender", "Subject", "Action Taken", "Reward", "Correct?"],
                )

        with gr.Row():
            with gr.Column():
                metrics_view = gr.Markdown("### Metrics\nN/A")
            with gr.Column():
                gr.Markdown("### 🎯 Performance Grade")
                grade_output = gr.Textbox(
                    label="Grade Report",
                    lines=10,
                )

        with gr.Row():
            gr.Markdown("### 📋 Level Comparison")
            comparison_output = gr.Textbox(
                label="Performance Across Difficulty Levels",
                lines=8,
            )

        # Event handlers
        run_button.click(
            fn=run_demo_episode,
            inputs=[task_level_dropdown, seed_slider],
            outputs=[output_view, metrics_view, grade_output],
        )
        compare_button.click(
            fn=compare_all_levels,
            inputs=[seed_slider],
            outputs=[comparison_output],
        )

        # Auto-run on load
        demo.load(
            fn=run_demo_episode,
            inputs=[task_level_dropdown, seed_slider],
            outputs=[output_view, metrics_view, grade_output],
        )

        # Footer text, written as adjacent literals so the code indentation
        # does not leak into the Markdown.
        gr.Markdown(
            "\n"
            "---\n"
            "**About:** This is a production-ready RL environment for training email triage agents.\n"
            "**Task:** Accurately classify emails. 0=Ignore, 1=Reply, 2=Forward, 3=Archive, 4=Delete.\n"
            "**Scoring:** Agents are graded on accuracy and critical safety (e.g. not deleting urgent emails).\n"
            "[View on GitHub](https://github.com/yourusername/OpenEnv) | [Documentation](https://github.com/yourusername/OpenEnv#readme)\n"
        )

    return demo
# Build the Gradio UI once, at import time, so it can be mounted below.
demo = create_demo()
# Mount the Gradio app onto the FastAPI server at the root path. `app` is
# rebound to the value returned by mount_gradio_app, which is what main()
# hands to uvicorn.
app = gr.mount_gradio_app(app, demo, path="/")
def main(host: str = "0.0.0.0", port: int = 7860):
    """Serve the FastAPI app (with the Gradio UI mounted) using uvicorn.

    Args:
        host: Interface to bind to. The default keeps the original
            hard-coded value, which listens on all interfaces.
        port: TCP port to listen on. The default keeps the original
            hard-coded value.
    """
    # uvicorn is already imported at module level; no local import needed.
    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    main()