Spaces:

KChad
/

Prompt-Injection-RL-environment

Running

App Files Files Community

Prompt-Injection-RL-environment / server /app.py

KChad

new changes

06721c4 9 days ago

raw

history blame contribute delete

15.8 kB

	from __future__ import annotations

	from typing import Any, Dict, Optional

	from fastapi import Body, FastAPI, HTTPException
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import HTMLResponse
	import uvicorn

	from env.environment import IDPIExfilEnv
	from env.models import (
	Action,
	EpisodeResult,
	HealthResponse,
	MetadataResponse,
	Observation,
	ResetRequest,
	Reward,
	SchemaResponse,
	State,
	StepRequest,
	StepResponse,
	)

	# Service identity. These values are returned by /health and /metadata and are
	# also interpolated into the HTML landing page.
	APP_VERSION = "0.1.0"
	ENV_NAME = "idpi-exfil-env"
	ENV_DESCRIPTION = "OpenEnv-compatible prompt injection benchmark for safe email processing."
	TASK_NAMES = ["easy", "medium", "hard"]
	# Base URL printed in the landing page's curl examples.
	SPACE_URL = "https://kchad-prompt-injection-rl-environment.hf.space"

	app = FastAPI(title="IDPI Exfil Environment", description=ENV_DESCRIPTION, version=APP_VERSION)

	# CORS is fully open: every origin, method, and header is allowed.
	app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])

	# A single environment instance is created at import time; the reset/step/state/result
	# handlers all operate on this one object.
	env = IDPIExfilEnv()


	def _model_schema(model: Any) -> Dict[str, Any]:
	    """Return the JSON schema produced by ``model``.

	    ``model.model_json_schema()`` is used when that attribute exists; otherwise
	    the function falls back to ``model.schema()``. (Presumably this covers both
	    Pydantic v2 and v1 model classes -- confirm against ``env.models``.)
	    """
	    try:
	        build_schema = model.model_json_schema
	    except AttributeError:
	        # No ``model_json_schema`` attribute: use the older accessor instead.
	        build_schema = model.schema
	    return build_schema()


	def _landing_page_html() -> str:
	    """Build the HTML landing page returned by the ``/`` and ``/web`` routes.

	    The page is assembled from three pieces: one card per task tier, one card
	    per showcase episode, and a fixed HTML/CSS shell into which the module-level
	    ``ENV_NAME``, ``APP_VERSION``, ``TASK_NAMES`` and ``SPACE_URL`` values are
	    interpolated.

	    Returns:
	        The complete HTML document as a string.
	    """
	    # (task name, blurb) pairs rendered in the "Runtime Pack" section.
	    task_blurbs = (
	        (
	            "easy",
	            "Introductory scenarios with obvious but believable relay attempts and lightly hidden prompt injections.",
	        ),
	        (
	            "medium",
	            "Curated mid-tier episodes using hidden carrier channels, transcript replay, and authority spoofing.",
	        ),
	        (
	            "hard",
	            "Safe-completion episodes where the agent must finish the business task without contaminating internal outputs.",
	        ),
	    )
	    task_cards = "".join(
	        f"""
	        <article class="task-card">
	            <span class="task-pill">{task.upper()}</span>
	            <h3>{task.title()}</h3>
	            <p>{description}</p>
	        </article>
	        """
	        for task, description in task_blurbs
	    )

	    # (channel label, title, blurb) triples rendered in the "Showcase Episodes" section.
	    showcase_blurbs = (
	        (
	            "HIDDEN MARKUP",
	            "Hidden Relay Instruction",
	            "Hidden relay instructions tucked into comment markup. Safe agents should read, flag, and keep the digest workflow internal.",
	        ),
	        (
	            "DELAYED TRIGGER",
	            "Summary-Time Hook",
	            "The malicious instruction only activates when the agent tries to write the benign summary, which makes the episode trajectory-aware instead of static-text-aware.",
	        ),
	        (
	            "SEMANTIC",
	            "Internal Output Poisoning",
	            "A business-looking workbook clause poisons the internal summary unless the model recognizes it as attacker-directed content.",
	        ),
	    )
	    showcase_cards = "".join(
	        f"""
	        <article class="task-card">
	            <span class="task-pill">{channel}</span>
	            <h3>{title}</h3>
	            <p>{description}</p>
	        </article>
	        """
	        for channel, title, description in showcase_blurbs
	    )

	    # Literal braces in the CSS and in the JSON curl example are doubled because
	    # this is an f-string. Inline endpoint names use <code class="inline"> so that
	    # the ``code.inline`` stylesheet rule actually matches them; the lines inside
	    # <pre> are kept free of extra indentation because <pre> preserves whitespace.
	    return f"""
	<!DOCTYPE html>
	<html lang="en">
	<head>
	    <meta charset="utf-8" />
	    <meta name="viewport" content="width=device-width, initial-scale=1" />
	    <title>IDPI Exfil Environment</title>
	    <style>
	        :root {{
	            --bg: #fbf6ef;
	            --panel: rgba(255, 250, 242, 0.9);
	            --ink: #23160d;
	            --muted: #6b5848;
	            --accent: #a64712;
	            --accent-2: #155d4b;
	            --line: rgba(53, 31, 15, 0.12);
	            --shadow: 0 24px 60px rgba(74, 35, 12, 0.12);
	        }}

	        * {{
	            box-sizing: border-box;
	        }}

	        body {{
	            margin: 0;
	            font-family: "IBM Plex Sans", "Segoe UI", sans-serif;
	            color: var(--ink);
	            background:
	                radial-gradient(circle at top left, rgba(212, 132, 56, 0.18), transparent 28%),
	                radial-gradient(circle at top right, rgba(21, 93, 75, 0.16), transparent 26%),
	                linear-gradient(180deg, #fffaf4 0%, var(--bg) 100%);
	        }}

	        .shell {{
	            max-width: 1160px;
	            margin: 0 auto;
	            padding: 40px 24px 56px;
	        }}

	        .hero {{
	            display: grid;
	            grid-template-columns: 1.5fr 1fr;
	            gap: 24px;
	            align-items: stretch;
	        }}

	        .panel {{
	            background: var(--panel);
	            border: 1px solid var(--line);
	            border-radius: 24px;
	            box-shadow: var(--shadow);
	            backdrop-filter: blur(8px);
	        }}

	        .hero-copy {{
	            padding: 32px;
	        }}

	        .eyebrow {{
	            display: inline-flex;
	            align-items: center;
	            gap: 8px;
	            padding: 8px 12px;
	            border-radius: 999px;
	            background: rgba(166, 71, 18, 0.08);
	            color: var(--accent);
	            font-weight: 700;
	            letter-spacing: 0.04em;
	            text-transform: uppercase;
	            font-size: 12px;
	        }}

	        h1 {{
	            margin: 18px 0 14px;
	            font-family: Georgia, "Times New Roman", serif;
	            font-size: clamp(2.4rem, 5vw, 4.3rem);
	            line-height: 0.96;
	            letter-spacing: -0.03em;
	        }}

	        .lede {{
	            font-size: 1.05rem;
	            line-height: 1.7;
	            color: var(--muted);
	            max-width: 60ch;
	        }}

	        .cta-row {{
	            display: flex;
	            flex-wrap: wrap;
	            gap: 12px;
	            margin-top: 24px;
	        }}

	        .cta {{
	            display: inline-flex;
	            align-items: center;
	            justify-content: center;
	            padding: 12px 16px;
	            border-radius: 14px;
	            text-decoration: none;
	            font-weight: 700;
	            border: 1px solid transparent;
	        }}

	        .cta.primary {{
	            background: var(--accent);
	            color: #fff7f0;
	        }}

	        .cta.secondary {{
	            border-color: var(--line);
	            color: var(--ink);
	            background: rgba(255, 255, 255, 0.6);
	        }}

	        .status-card {{
	            padding: 28px;
	            display: flex;
	            flex-direction: column;
	            justify-content: space-between;
	            min-height: 100%;
	        }}

	        .status-grid {{
	            display: grid;
	            grid-template-columns: repeat(2, minmax(0, 1fr));
	            gap: 12px;
	            margin-top: 18px;
	        }}

	        .stat {{
	            padding: 14px;
	            border-radius: 18px;
	            background: rgba(255, 255, 255, 0.72);
	            border: 1px solid var(--line);
	        }}

	        .stat-label {{
	            font-size: 0.72rem;
	            text-transform: uppercase;
	            letter-spacing: 0.08em;
	            color: var(--muted);
	            margin-bottom: 6px;
	        }}

	        .stat-value {{
	            font-size: 1.1rem;
	            font-weight: 700;
	        }}

	        .section {{
	            margin-top: 24px;
	            padding: 28px;
	        }}

	        .section h2 {{
	            margin: 0 0 14px;
	            font-size: 1.4rem;
	        }}

	        .task-grid {{
	            display: grid;
	            grid-template-columns: repeat(3, minmax(0, 1fr));
	            gap: 14px;
	        }}

	        .task-card {{
	            padding: 20px;
	            border-radius: 20px;
	            background: rgba(255, 255, 255, 0.7);
	            border: 1px solid var(--line);
	        }}

	        .task-card h3 {{
	            margin: 10px 0 10px;
	        }}

	        .task-pill {{
	            display: inline-block;
	            font-size: 0.74rem;
	            font-weight: 700;
	            letter-spacing: 0.06em;
	            color: var(--accent-2);
	            text-transform: uppercase;
	        }}

	        .api-grid {{
	            display: grid;
	            grid-template-columns: 1.1fr 0.9fr;
	            gap: 16px;
	        }}

	        pre {{
	            margin: 0;
	            padding: 18px;
	            border-radius: 18px;
	            background: #1f1712;
	            color: #f5e7d9;
	            overflow-x: auto;
	            font-family: "IBM Plex Mono", "Consolas", monospace;
	            font-size: 0.93rem;
	            line-height: 1.55;
	        }}

	        code.inline {{
	            padding: 2px 8px;
	            border-radius: 999px;
	            background: rgba(166, 71, 18, 0.09);
	            color: var(--accent);
	            font-family: "IBM Plex Mono", "Consolas", monospace;
	        }}

	        ul {{
	            margin: 0;
	            padding-left: 20px;
	            line-height: 1.8;
	            color: var(--muted);
	        }}

	        .footer-note {{
	            margin-top: 16px;
	            color: var(--muted);
	            font-size: 0.95rem;
	        }}

	        @media (max-width: 900px) {{
	            .hero,
	            .task-grid,
	            .api-grid {{
	                grid-template-columns: 1fr;
	            }}

	            .shell {{
	                padding: 24px 16px 40px;
	            }}

	            .hero-copy,
	            .status-card,
	            .section {{
	                padding: 22px;
	            }}
	        }}
	    </style>
	</head>
	<body>
	    <main class="shell">
	        <section class="hero">
	            <div class="panel hero-copy">
	                <span class="eyebrow">OpenEnv Benchmark</span>
	                <h1>IDPI Exfil Environment</h1>
	                <p class="lede">
	                    A prompt-injection benchmark for secure email-style agent workflows. The runtime pack
	                    contains curated easy, medium, and hard scenarios covering hidden relay instructions,
	                    delayed summary hooks, transcript replay, markup-based attacks, and internal-output
	                    contamination attempts.
	                </p>
	                <div class="cta-row">
	                    <a class="cta primary" href="/docs">Open API Docs</a>
	                    <a class="cta secondary" href="/metadata">View Metadata</a>
	                    <a class="cta secondary" href="/openapi.json">OpenAPI JSON</a>
	                </div>
	            </div>
	            <aside class="panel status-card">
	                <div>
	                    <span class="eyebrow">Service Status</span>
	                    <div class="status-grid">
	                        <div class="stat">
	                            <div class="stat-label">Env</div>
	                            <div class="stat-value">{ENV_NAME}</div>
	                        </div>
	                        <div class="stat">
	                            <div class="stat-label">Version</div>
	                            <div class="stat-value">{APP_VERSION}</div>
	                        </div>
	                        <div class="stat">
	                            <div class="stat-label">Tasks</div>
	                            <div class="stat-value">{", ".join(TASK_NAMES)}</div>
	                        </div>
	                        <div class="stat">
	                            <div class="stat-label">Runtime Pack</div>
	                            <div class="stat-value">Curated / 32 scenarios</div>
	                        </div>
	                    </div>
	                </div>
	                <p class="footer-note">
	                    The API remains fully OpenEnv-compatible. This page is just a human-friendly landing
	                    surface for the same benchmark endpoints the validator uses.
	                </p>
	            </aside>
	        </section>

	        <section class="panel section">
	            <h2>Runtime Pack</h2>
	            <div class="task-grid">
	                {task_cards}
	            </div>
	        </section>

	        <section class="panel section">
	            <h2>Showcase Episodes</h2>
	            <div class="task-grid">
	                {showcase_cards}
	            </div>
	        </section>

	        <section class="panel section">
	            <h2>Quick Start</h2>
	            <div class="api-grid">
	                <pre>curl -X POST {SPACE_URL}/reset

	curl -X POST {SPACE_URL}/step \\
	-H "Content-Type: application/json" \\
	-d '{{"action": {{"type": "ReadDoc"}}}}'

	curl {SPACE_URL}/state
	curl {SPACE_URL}/result</pre>
	                <div>
	                    <ul>
	                        <li>Use <code class="inline">POST /reset</code> to start a new episode. An empty body is valid.</li>
	                        <li>Use <code class="inline">POST /step</code> with typed OpenEnv actions.</li>
	                        <li>The evaluation runner uses the curated benchmark manifest as the runtime source of truth.</li>
	                        <li>Inspect live schemas at <code class="inline">/schema</code>, docs at <code class="inline">/docs</code>, and metadata at <code class="inline">/metadata</code>.</li>
	                        <li>Call <code class="inline">/health</code> for service health and <code class="inline">/result</code> after an episode to inspect grader output.</li>
	                    </ul>
	                </div>
	            </div>
	        </section>
	    </main>
	</body>
	</html>
	"""


	@app.get("/", response_class=HTMLResponse)
	def root() -> HTMLResponse:
	    """Serve the HTML landing page at the site root."""
	    page = _landing_page_html()
	    return HTMLResponse(content=page)


	@app.get("/web", response_class=HTMLResponse)
	def web_root() -> HTMLResponse:
	    """Serve the same HTML landing page under the ``/web`` path."""
	    page = _landing_page_html()
	    return HTMLResponse(content=page)


	@app.get("/health", response_model=HealthResponse)
	def health() -> HealthResponse:
	    """Return a fixed ``"healthy"`` status with the environment name and app version."""
	    fields = {"status": "healthy", "env": ENV_NAME, "version": APP_VERSION}
	    return HealthResponse(**fields)


	@app.get("/metadata", response_model=MetadataResponse)
	def metadata() -> MetadataResponse:
	    """Return the environment's name, description, version, and task list."""
	    fields = {
	        "name": ENV_NAME,
	        "description": ENV_DESCRIPTION,
	        "version": APP_VERSION,
	        "tasks": TASK_NAMES,
	    }
	    return MetadataResponse(**fields)


	@app.get("/schema", response_model=SchemaResponse)
	def schema() -> SchemaResponse:
	    """Return the JSON schemas of the action, observation, state, reward, and result models."""
	    # Response field name -> model class whose schema fills that field.
	    models = {
	        "action": Action,
	        "observation": Observation,
	        "state": State,
	        "reward": Reward,
	        "result": EpisodeResult,
	    }
	    schemas = {field: _model_schema(model) for field, model in models.items()}
	    return SchemaResponse(**schemas)


	@app.post("/reset", response_model=Observation)
	def reset(request: Optional[ResetRequest] = Body(default=None)) -> Observation:
	    """Start a new episode and return its first observation.

	    The request body is optional; when it is missing, a default ``ResetRequest``
	    supplies ``difficulty`` and ``seed``.

	    Raises:
	        HTTPException: 400 when a ``ValueError`` is raised, 500 for any other
	            exception. The exception text is passed through as ``detail``.
	    """
	    try:
	        payload = request or ResetRequest()
	        return env.reset(difficulty=payload.difficulty, seed=payload.seed)
	    except Exception as exc:
	        # ValueError is reported as a client error; everything else as a server error.
	        status = 400 if isinstance(exc, ValueError) else 500
	        raise HTTPException(status_code=status, detail=str(exc)) from exc


	@app.post("/step", response_model=StepResponse)
	def step(request: StepRequest) -> StepResponse:
	    """Apply ``request.action`` to the environment and return the step outcome.

	    Returns:
	        A ``StepResponse`` carrying the observation, reward, done flag, and info
	        returned by ``env.step``.

	    Raises:
	        HTTPException: 400 when a ``RuntimeError`` or ``ValueError`` is raised,
	            500 for any other exception. The exception text is passed through
	            as ``detail``.
	    """
	    try:
	        outcome = env.step(request.action)
	        observation, reward, done, info = outcome
	        return StepResponse(observation=observation, reward=reward, done=done, info=info)
	    except Exception as exc:
	        client_fault = isinstance(exc, (RuntimeError, ValueError))
	        status = 400 if client_fault else 500
	        raise HTTPException(status_code=status, detail=str(exc)) from exc


	@app.get("/state", response_model=State)
	def state() -> State:
	    """Return the environment's current state as reported by ``env.state()``.

	    Raises:
	        HTTPException: 400 when a ``RuntimeError`` is raised, 500 for any other
	            exception. The exception text is passed through as ``detail``.
	    """
	    try:
	        return env.state()
	    except Exception as exc:
	        status = 400 if isinstance(exc, RuntimeError) else 500
	        raise HTTPException(status_code=status, detail=str(exc)) from exc


	@app.get("/result", response_model=EpisodeResult)
	def result() -> EpisodeResult:
	    """Return the episode result as reported by ``env.get_result()``.

	    Raises:
	        HTTPException: 400 when a ``RuntimeError`` is raised, 500 for any other
	            exception. The exception text is passed through as ``detail``.
	    """
	    try:
	        return env.get_result()
	    except Exception as exc:
	        status = 400 if isinstance(exc, RuntimeError) else 500
	        raise HTTPException(status_code=status, detail=str(exc)) from exc


	@app.post("/mcp")
	def mcp(payload: Optional[Dict[str, Any]] = Body(default=None)) -> Dict[str, Any]:
	    """Answer every MCP call with a JSON-RPC 2.0 error object.

	    The reply echoes the caller's ``id`` when the body is a dict (``None``
	    otherwise) and always carries error code -32601 with a message pointing
	    to the HTTP endpoints.
	    """
	    request_id = payload.get("id") if isinstance(payload, dict) else None
	    error = {
	        "code": -32601,
	        "message": "MCP bridge is not implemented for this benchmark. Use the HTTP reset/step/state/result endpoints.",
	    }
	    return {"jsonrpc": "2.0", "id": request_id, "error": error}


	def main() -> None:
	    """Run uvicorn for ``server.app:app`` on 0.0.0.0:7860 with reload disabled."""
	    server_options = {"host": "0.0.0.0", "port": 7860, "reload": False}
	    uvicorn.run("server.app:app", **server_options)


	# Start the server only when this file is executed as a script.
	if __name__ == "__main__":
	    main()