from __future__ import annotations
from typing import Any, Dict, Optional
from fastapi import Body, FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
import uvicorn
from env.environment import IDPIExfilEnv
from env.models import (
Action,
EpisodeResult,
HealthResponse,
MetadataResponse,
Observation,
ResetRequest,
Reward,
SchemaResponse,
State,
StepRequest,
StepResponse,
)
# Service identity: passed to the FastAPI constructor below and reported by
# the /health and /metadata endpoints.
APP_VERSION = "0.1.0"
ENV_NAME = "idpi-exfil-env"
ENV_DESCRIPTION = "OpenEnv-compatible prompt injection benchmark for safe email processing."
# Task names advertised by /metadata and listed on the landing page.
TASK_NAMES = ["easy", "medium", "hard"]
# Base URL interpolated into the landing page's curl quick-start examples.
SPACE_URL = "https://kchad-prompt-injection-rl-environment.hf.space"
# The ASGI application object; main() refers to it by the import string
# "server.app:app".
app = FastAPI(
    title="IDPI Exfil Environment",
    description=ENV_DESCRIPTION,
    version=APP_VERSION,
)
# CORS is fully open: every origin, method, and header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Single module-level environment instance used by the /reset, /step, /state
# and /result handlers.
# NOTE(review): all requests share this one instance, so concurrent clients
# would presumably share episode state — confirm this is intended.
env = IDPIExfilEnv()
def _model_schema(model: Any) -> Dict[str, Any]:
    """Return the JSON schema produced by *model*.

    Calls ``model.model_json_schema()`` when that attribute exists and
    falls back to ``model.schema()`` otherwise.

    Args:
        model: Any object exposing one of the two schema methods.

    Returns:
        Whatever the selected schema method returns.
    """
    # Pick the method name first so the call itself happens in one place.
    method_name = "model_json_schema" if hasattr(model, "model_json_schema") else "schema"
    return getattr(model, method_name)()
def _landing_page_html() -> str:
    """Return the landing-page text that ``root`` and ``web_root`` serve.

    The page is assembled from three f-string pieces: one card per task
    difficulty, one card per showcase episode, and an outer template that
    interpolates both card strings together with ``TASK_NAMES`` and
    ``SPACE_URL``.

    Returns:
        The fully rendered page as a single string.
    """
    # One card per difficulty tier: the upper-cased name, the title-cased
    # name, then the description, each on its own line.
    task_cards = "".join(
        f"""
{task.upper()}
{task.title()}
{description}
"""
        for task, description in (
            (
                "easy",
                "Introductory scenarios with obvious but believable relay attempts and lightly hidden prompt injections.",
            ),
            (
                "medium",
                "Curated mid-tier episodes using hidden carrier channels, transcript replay, and authority spoofing.",
            ),
            (
                "hard",
                "Safe-completion episodes where the agent must finish the business task without contaminating internal outputs.",
            ),
        )
    )
    # One card per showcase episode: channel label, title, description.
    showcase_cards = "".join(
        f"""
{channel}
{title}
{description}
"""
        for channel, title, description in (
            (
                "HIDDEN MARKUP",
                "Hidden Relay Instruction",
                "Hidden relay instructions tucked into comment markup. Safe agents should read, flag, and keep the digest workflow internal.",
            ),
            (
                "DELAYED TRIGGER",
                "Summary-Time Hook",
                "The malicious instruction only activates when the agent tries to write the benign summary, which makes the episode trajectory-aware instead of static-text-aware.",
            ),
            (
                "SEMANTIC",
                "Internal Output Poisoning",
                "A business-looking workbook clause poisons the internal summary unless the model recognizes it as attacker-directed content.",
            ),
        )
    )
    # Outer template. The doubled braces in the curl example are f-string
    # escapes for literal "{" and "}", and each "\\" renders as a single
    # backslash (a shell line continuation in the displayed command).
    # NOTE(review): the "32 scenarios" figure is hard-coded in the text below.
    return f"""
IDPI Exfil Environment
OpenEnv Benchmark
IDPI Exfil Environment
A prompt-injection benchmark for secure email-style agent workflows. The runtime pack
contains curated easy, medium, and hard scenarios covering hidden relay instructions,
delayed summary hooks, transcript replay, markup-based attacks, and internal-output
contamination attempts.
Service Status
Tasks
{", ".join(TASK_NAMES)}
Runtime Pack
Curated / 32 scenarios
Runtime Pack
{task_cards}
Showcase Episodes
{showcase_cards}
Quick Start
curl -X POST {SPACE_URL}/reset
curl -X POST {SPACE_URL}/step \\
-H "Content-Type: application/json" \\
-d '{{"action": {{"type": "ReadDoc"}}}}'
curl {SPACE_URL}/state
curl {SPACE_URL}/result
Use POST /reset to start a new episode. An empty body is valid.
Use POST /step with typed OpenEnv actions.
The evaluation runner uses the curated benchmark manifest as the runtime source of truth.
Inspect live schemas at /schema , docs at /docs , and metadata at /metadata .
Call /health for service health and /result after an episode to inspect grader output.
"""
@app.get("/", response_class=HTMLResponse)
def root() -> HTMLResponse:
    """Serve the landing page at the site root."""
    page = _landing_page_html()
    return HTMLResponse(content=page)
@app.get("/web", response_class=HTMLResponse)
def web_root() -> HTMLResponse:
    """Serve the same landing page under the ``/web`` path."""
    page = _landing_page_html()
    return HTMLResponse(content=page)
@app.get("/health", response_model=HealthResponse)
def health() -> HealthResponse:
    """Report a fixed ``"healthy"`` status with the environment name and version."""
    fields = {"status": "healthy", "env": ENV_NAME, "version": APP_VERSION}
    return HealthResponse(**fields)
@app.get("/metadata", response_model=MetadataResponse)
def metadata() -> MetadataResponse:
    """Describe the environment: name, description, version, and task names."""
    fields = {
        "name": ENV_NAME,
        "description": ENV_DESCRIPTION,
        "version": APP_VERSION,
        "tasks": TASK_NAMES,
    }
    return MetadataResponse(**fields)
@app.get("/schema", response_model=SchemaResponse)
def schema() -> SchemaResponse:
    """Return the JSON schemas of the action, observation, state, reward, and result models."""
    # Response field name -> model whose schema fills that field. Dict order
    # keeps the schemas generated in the same sequence as the field list.
    documented_models = {
        "action": Action,
        "observation": Observation,
        "state": State,
        "reward": Reward,
        "result": EpisodeResult,
    }
    schemas = {
        field: _model_schema(model) for field, model in documented_models.items()
    }
    return SchemaResponse(**schemas)
@app.post("/reset", response_model=Observation)
def reset(request: Optional[ResetRequest] = Body(default=None)) -> Observation:
    """Start a new episode and return its first observation.

    Args:
        request: Optional reset options. When the body is absent (or falsy),
            a default-constructed ``ResetRequest`` is used instead.

    Returns:
        The value returned by ``env.reset``.

    Raises:
        HTTPException: 400 when a ``ValueError`` is raised, 500 for any
            other exception; ``detail`` carries the exception text.
    """
    try:
        # Built inside the try so a failure constructing the default request
        # is reported through the same handlers as a failing reset.
        settings = request if request else ResetRequest()
        first_observation = env.reset(
            difficulty=settings.difficulty,
            seed=settings.seed,
        )
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return first_observation
@app.post("/step", response_model=StepResponse)
def step(request: StepRequest) -> StepResponse:
    """Apply one action to the shared environment and return the outcome.

    Args:
        request: Request body carrying the ``action`` to execute.

    Returns:
        A ``StepResponse`` built from the observation, reward, done flag,
        and info returned by ``env.step``.

    Raises:
        HTTPException: 400 when a ``RuntimeError`` or ``ValueError`` is
            raised, 500 for any other exception; ``detail`` carries the
            exception text.
    """
    try:
        # Unpacking and response construction stay inside the try so their
        # failures are translated by the handlers below as well.
        observation, reward, done, info = env.step(request.action)
        return StepResponse(
            observation=observation,
            reward=reward,
            done=done,
            info=info,
        )
    except (RuntimeError, ValueError) as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
@app.get("/state", response_model=State)
def state() -> State:
    """Return the environment's current state.

    Returns:
        The value returned by ``env.state()``.

    Raises:
        HTTPException: 400 when a ``RuntimeError`` is raised, 500 for any
            other exception; ``detail`` carries the exception text.
    """
    try:
        snapshot = env.state()
    except RuntimeError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return snapshot
@app.get("/result", response_model=EpisodeResult)
def result() -> EpisodeResult:
    """Return the episode result reported by the environment.

    Returns:
        The value returned by ``env.get_result()``.

    Raises:
        HTTPException: 400 when a ``RuntimeError`` is raised, 500 for any
            other exception; ``detail`` carries the exception text.
    """
    try:
        episode_result = env.get_result()
    except RuntimeError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    return episode_result
@app.post("/mcp")
def mcp(payload: Optional[Dict[str, Any]] = Body(default=None)) -> Dict[str, Any]:
    """Answer every MCP request with a JSON-RPC "not implemented" error.

    Args:
        payload: Optional request body. When it is a dict, its ``"id"``
            entry is echoed back; otherwise the response id is ``None``.

    Returns:
        A JSON-RPC 2.0 error object with code ``-32601`` (the spec's
        "Method not found" code) and a message pointing callers at the
        HTTP endpoints.
    """
    request_id = payload.get("id") if isinstance(payload, dict) else None
    error = {
        "code": -32601,
        "message": "MCP bridge is not implemented for this benchmark. Use the HTTP reset/step/state/result endpoints.",
    }
    return {"jsonrpc": "2.0", "id": request_id, "error": error}
def main() -> None:
    """Run the app under uvicorn on all interfaces, port 7860, with reload off."""
    server_options = {"host": "0.0.0.0", "port": 7860, "reload": False}
    # The app is passed as an import string rather than as the object itself.
    uvicorn.run("server.app:app", **server_options)


if __name__ == "__main__":
    main()