"""HTTP server exposing the IDPI Exfil environment.

The module builds a FastAPI application with a landing page, health /
metadata / schema endpoints, and the ``reset`` / ``step`` / ``state`` /
``result`` endpoints that drive a single module-level ``IDPIExfilEnv``
instance.
"""

from __future__ import annotations

from typing import Any, Dict, Optional, Tuple, Type

from fastapi import Body, FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
import uvicorn

from env.environment import IDPIExfilEnv
from env.models import (
    Action,
    EpisodeResult,
    HealthResponse,
    MetadataResponse,
    Observation,
    ResetRequest,
    Reward,
    SchemaResponse,
    State,
    StepRequest,
    StepResponse,
)

APP_VERSION = "0.1.0"
ENV_NAME = "idpi-exfil-env"
ENV_DESCRIPTION = "OpenEnv-compatible prompt injection benchmark for safe email processing."
TASK_NAMES = ["easy", "medium", "hard"]
SPACE_URL = "https://kchad-prompt-injection-rl-environment.hf.space"

# (task name, description) pairs rendered in the "Runtime Pack" section.
_TASK_CARDS: Tuple[Tuple[str, str], ...] = (
    (
        "easy",
        "Introductory scenarios with obvious but believable relay attempts and lightly hidden prompt injections.",
    ),
    (
        "medium",
        "Curated mid-tier episodes using hidden carrier channels, transcript replay, and authority spoofing.",
    ),
    (
        "hard",
        "Safe-completion episodes where the agent must finish the business task without contaminating internal outputs.",
    ),
)

# (channel, title, description) triples rendered in the "Showcase Episodes" section.
_SHOWCASE_CARDS: Tuple[Tuple[str, str, str], ...] = (
    (
        "HIDDEN MARKUP",
        "Hidden Relay Instruction",
        "Hidden relay instructions tucked into comment markup. Safe agents should read, flag, and keep the digest workflow internal.",
    ),
    (
        "DELAYED TRIGGER",
        "Summary-Time Hook",
        "The malicious instruction only activates when the agent tries to write the benign summary, which makes the episode trajectory-aware instead of static-text-aware.",
    ),
    (
        "SEMANTIC",
        "Internal Output Poisoning",
        "A business-looking workbook clause poisons the internal summary unless the model recognizes it as attacker-directed content.",
    ),
)

app = FastAPI(
    title="IDPI Exfil Environment",
    description=ENV_DESCRIPTION,
    version=APP_VERSION,
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# One environment instance is created at import time and shared by every
# request handler below.
env = IDPIExfilEnv()


def _model_schema(model: Any) -> Dict[str, Any]:
    """Return the JSON schema for *model*.

    Uses ``model.model_json_schema()`` when the attribute exists and falls
    back to ``model.schema()`` otherwise (presumably to support both newer
    and older pydantic releases -- confirm against the pinned version).
    """
    if hasattr(model, "model_json_schema"):
        return model.model_json_schema()
    return model.schema()


def _as_http_error(
    exc: Exception,
    client_errors: Tuple[Type[Exception], ...],
) -> HTTPException:
    """Translate *exc* into the ``HTTPException`` an endpoint should raise.

    Args:
        exc: The exception caught by the endpoint.
        client_errors: Exception types that are reported as HTTP 400; any
            other exception is reported as HTTP 500.

    Returns:
        An ``HTTPException`` whose ``detail`` is ``str(exc)``. The caller is
        expected to ``raise ... from exc`` so the original cause is chained.
    """
    status_code = 400 if isinstance(exc, client_errors) else 500
    return HTTPException(status_code=status_code, detail=str(exc))


def _landing_page_html() -> str:
    """Build the landing page served at ``/`` and ``/web``.

    The page text is assembled from ``_TASK_CARDS``, ``_SHOWCASE_CARDS`` and
    ``SPACE_URL``; the string contents (including blank lines) are emitted
    exactly as written in the templates below.
    """
    # NOTE(review): the templates below contain no HTML tags even though the
    # result is returned through HTMLResponse -- confirm that markup was not
    # lost somewhere upstream of this file.
    task_cards = "".join(
        f"""

{task.upper()}

{task.title()}

{description}

"""
        for task, description in _TASK_CARDS
    )
    showcase_cards = "".join(
        f"""

{channel}

{title}

{description}

"""
        for channel, title, description in _SHOWCASE_CARDS
    )
    return f""" IDPI Exfil Environment

OpenEnv Benchmark

IDPI Exfil Environment

A prompt-injection benchmark for secure email-style agent workflows. The runtime pack contains curated easy, medium, and hard scenarios covering hidden relay instructions, delayed summary hooks, transcript replay, markup-based attacks, and internal-output contamination attempts.

Open API Docs View Metadata OpenAPI JSON

Runtime Pack

{task_cards}

Showcase Episodes

{showcase_cards}

Quick Start

curl -X POST {SPACE_URL}/reset

curl -X POST {SPACE_URL}/step \\
  -H "Content-Type: application/json" \\
  -d '{{"action": {{"type": "ReadDoc"}}}}'

curl {SPACE_URL}/state
curl {SPACE_URL}/result

Use POST /reset to start a new episode. An empty body is valid.
Use POST /step with typed OpenEnv actions.
The evaluation runner uses the curated benchmark manifest as the runtime source of truth.
Inspect live schemas at /schema, docs at /docs, and metadata at /metadata.
Call /health for service health and /result after an episode to inspect grader output.

"""


@app.get("/", response_class=HTMLResponse)
def root() -> HTMLResponse:
    """Serve the landing page."""
    return HTMLResponse(content=_landing_page_html())


@app.get("/web", response_class=HTMLResponse)
def web_root() -> HTMLResponse:
    """Serve the landing page under the ``/web`` path as well."""
    return HTMLResponse(content=_landing_page_html())


@app.get("/health", response_model=HealthResponse)
def health() -> HealthResponse:
    """Report a static "healthy" status with the environment name and version."""
    return HealthResponse(status="healthy", env=ENV_NAME, version=APP_VERSION)


@app.get("/metadata", response_model=MetadataResponse)
def metadata() -> MetadataResponse:
    """Return the environment name, description, version and task names."""
    return MetadataResponse(
        name=ENV_NAME,
        description=ENV_DESCRIPTION,
        version=APP_VERSION,
        tasks=TASK_NAMES,
    )


@app.get("/schema", response_model=SchemaResponse)
def schema() -> SchemaResponse:
    """Return JSON schemas for the action, observation, state, reward and result models."""
    return SchemaResponse(
        action=_model_schema(Action),
        observation=_model_schema(Observation),
        state=_model_schema(State),
        reward=_model_schema(Reward),
        result=_model_schema(EpisodeResult),
    )


@app.post("/reset", response_model=Observation)
def reset(request: Optional[ResetRequest] = Body(default=None)) -> Observation:
    """Start a new episode and return its first observation.

    A missing body is replaced by a default ``ResetRequest()``.

    Raises:
        HTTPException: 400 when a ``ValueError`` is raised, 500 for any other
            exception; ``detail`` carries the exception message.
    """
    try:
        payload = request or ResetRequest()
        return env.reset(difficulty=payload.difficulty, seed=payload.seed)
    except Exception as exc:
        raise _as_http_error(exc, (ValueError,)) from exc


@app.post("/step", response_model=StepResponse)
def step(request: StepRequest) -> StepResponse:
    """Apply ``request.action`` to the environment and return the step outcome.

    Raises:
        HTTPException: 400 when a ``RuntimeError`` or ``ValueError`` is
            raised, 500 for any other exception; ``detail`` carries the
            exception message.
    """
    try:
        observation, reward, done, info = env.step(request.action)
        return StepResponse(observation=observation, reward=reward, done=done, info=info)
    except Exception as exc:
        raise _as_http_error(exc, (RuntimeError, ValueError)) from exc


@app.get("/state", response_model=State)
def state() -> State:
    """Return the environment's current state.

    Raises:
        HTTPException: 400 when a ``RuntimeError`` is raised, 500 for any
            other exception; ``detail`` carries the exception message.
    """
    try:
        return env.state()
    except Exception as exc:
        raise _as_http_error(exc, (RuntimeError,)) from exc


@app.get("/result", response_model=EpisodeResult)
def result() -> EpisodeResult:
    """Return the environment's episode result.

    Raises:
        HTTPException: 400 when a ``RuntimeError`` is raised, 500 for any
            other exception; ``detail`` carries the exception message.
    """
    try:
        return env.get_result()
    except Exception as exc:
        raise _as_http_error(exc, (RuntimeError,)) from exc


@app.post("/mcp")
def mcp(payload: Optional[Dict[str, Any]] = Body(default=None)) -> Dict[str, Any]:
    """Answer any MCP request with a JSON-RPC "method not found" error.

    The ``id`` of the incoming payload is echoed back when the payload is a
    dict; otherwise ``id`` is ``None``.
    """
    request_id = None
    if isinstance(payload, dict):
        request_id = payload.get("id")
    return {
        "jsonrpc": "2.0",
        "id": request_id,
        "error": {
            "code": -32601,
            "message": "MCP bridge is not implemented for this benchmark. Use the HTTP reset/step/state/result endpoints.",
        },
    }


def main() -> None:
    """Run the application with uvicorn on 0.0.0.0:7860 without auto-reload."""
    uvicorn.run("server.app:app", host="0.0.0.0", port=7860, reload=False)


if __name__ == "__main__":
    main()