Spaces:

Timusgeorge
/

SynthAudit-Env

Sleeping

App Files Files Community

SynthAudit-Env / app.py

Timusgeorge

Fix graph visibility: bigger text, higher contrast, boxed annotations

4ce33b7 verified 14 days ago

raw

history blame contribute delete

25.7 kB

	"""
	SynthAudit.Env — HuggingFace Space Dashboard (200-Step GRPO)
	Premium Medical AI Oversight Interface
	"""

	import gradio as gr
	import numpy as np

	# ─── 200-Step GRPO Training Data (REAL from trainer_state.json) ───
	REWARDS_200 = [
	0.184,0.1201,0.1201,0.0333,0.1145,0.1035,0.244,0.1729,0.1007,0.1063,
	0.1174,0.3363,0.18,0.1736,0.2347,0.0333,0.1063,0.0416,0.1174,0.2712,
	0.2014,0.1736,0.1736,0.1174,0.0444,0.1763,0.1792,0.2069,0.1736,0.1673,
	0.2014,0.2018,0.3584,0.1856,0.2347,0.1991,0.193,0.1229,0.2513,0.2201,
	0.2347,0.0333,0.1645,0.1736,0.2597,0.2708,0.2485,0.2014,0.1847,0.1847,
	0.2907,0.1063,0.1903,0.1736,0.1945,0.1173,0.1063,0.293,0.2847,0.2763,
	0.1173,0.2347,0.2145,0.3002,0.1145,0.1035,0.2569,0.1173,0.2996,0.2903,
	0.3751,0.0333,0.2347,0.1903,0.1146,0.0333,0.109,0.3341,0.2224,0.2347,
	0.2702,0.1812,0.1903,0.2224,0.3013,0.1903,0.1118,0.1646,0.179,0.2375,
	0.209,0.3885,0.2796,0.2846,0.1145,0.2903,0.1903,0.1763,0.1007,0.1736,
	0.2168,0.2435,0.2146,0.2958,0.263,0.1903,0.3647,0.2569,0.1257,0.0333,
	0.2501,0.2907,0.2173,0.2935,0.3485,0.3264,0.368,0.1007,0.1201,0.109,
	0.3207,0.2324,0.2542,0.2946,0.3514,0.2597,0.399,0.4013,0.3701,0.4363,
	0.025,0.0333,0.368,0.0333,0.1958,0.3046,0.3208,0.2401,0.3013,0.2553,
	0.3074,0.2347,0.368,0.2344,0.2708,0.3335,0.2819,0.3241,0.3813,0.0333,
	0.0361,0.1145,0.1174,0.293,0.2769,0.0472,0.5063,0.1874,0.3625,0.1862,
	0.1945,0.3051,0.1173,0.3541,0.1007,0.2784,0.0217,0.1173,0.184,0.184,
	0.2347,0.3374,0.1955,0.3514,0.2206,0.3546,0.109,0.2824,0.1708,0.3514,
	0.1958,0.3958,0.3013,0.2485,0.0979,0.2875,0.3013,0.3124,0.4051,0.2764,
	0.2542,0.1285,0.4053,0.1895,0.2375,0.3196,0.2625,0.3735,0.1874,0.3462,
	]
	STEPS = list(range(1, 201))

	# ─── Post-Training Eval Data (REAL) ───
	EVAL_BASE = {"easy": 0.087, "medium": 0.018, "hard": 0.015, "overall": 0.040}
	EVAL_TRAINED = {"easy": 0.287, "medium": 0.129, "hard": 0.044, "overall": 0.153}


	def make_reward_plot():
	import matplotlib; matplotlib.use('Agg')
	import matplotlib.pyplot as plt

	w = 10
	avg = [float(np.mean(REWARDS_200[max(0,i-w+1):i+1])) for i in range(200)]

	fig, ax = plt.subplots(figsize=(14, 6), facecolor='#0d1117')
	ax.set_facecolor('#161b22')
	ax.tick_params(colors='#c9d1d9', labelsize=11)
	for s in ax.spines.values(): s.set_color('#30363d')
	ax.grid(True, alpha=0.15, color='#58a6ff')

	ax.fill_between(STEPS, REWARDS_200, alpha=0.18, color='#58a6ff')
	ax.plot(STEPS, REWARDS_200, '-', color='#58a6ff', linewidth=1.0, alpha=0.6, label='Step Reward')
	ax.plot(STEPS, avg, '-', color='#f0883e', linewidth=3, label=f'Running Avg (w={w})')

	# Phase bands
	ax.axvspan(1, 120, alpha=0.06, color='#3fb950')
	ax.axvspan(120, 170, alpha=0.06, color='#f0883e')
	ax.axvspan(170, 200, alpha=0.06, color='#f85149')
	ax.text(60, 0.02, 'WARM-UP', color='#3fb950', fontsize=12, ha='center', alpha=0.9, fontweight='bold')
	ax.text(145, 0.02, 'SCALING', color='#f0883e', fontsize=12, ha='center', alpha=0.9, fontweight='bold')
	ax.text(185, 0.02, 'HARD', color='#f85149', fontsize=12, ha='center', alpha=0.9, fontweight='bold')

	# Peak annotation
	peak_i = int(np.argmax(REWARDS_200))
	ax.annotate(f'Peak: {REWARDS_200[peak_i]:.3f}', xy=(STEPS[peak_i], REWARDS_200[peak_i]),
	xytext=(STEPS[peak_i]-40, REWARDS_200[peak_i]+0.08),
	arrowprops=dict(arrowstyle='->', color='#ff7b72', lw=2),
	fontsize=13, fontweight='bold', color='#ff7b72',
	bbox=dict(boxstyle='round,pad=0.3', facecolor='#21262d', edgecolor='#ff7b72', alpha=0.9))

	ax.set_xlabel('Training Step', color='#c9d1d9', fontsize=13)
	ax.set_ylabel('Mean Reward', color='#c9d1d9', fontsize=13)
	ax.set_title('GRPO 200-Step Reward Curve — Qwen2.5-3B-Instruct \| 4-bit QLoRA \| Tesla T4',
	color='#f0f6fc', fontsize=14, fontweight='bold', pad=12)
	ax.legend(fontsize=11, facecolor='#21262d', edgecolor='#30363d', labelcolor='#f0f6fc')
	ax.set_xlim(0.5, 200.5)
	plt.tight_layout()
	return fig


	def make_comparison_plot():
	import matplotlib; matplotlib.use('Agg')
	import matplotlib.pyplot as plt

	fig, ax = plt.subplots(figsize=(10, 6), facecolor='#0d1117')
	ax.set_facecolor('#161b22')
	ax.tick_params(colors='#c9d1d9', labelsize=11)
	for s in ax.spines.values(): s.set_color('#30363d')
	ax.grid(True, alpha=0.15, color='#58a6ff', axis='y')

	diffs = ['Easy', 'Medium', 'Hard', 'Overall']
	base = [0.087, 0.018, 0.015, 0.040]
	trained = [0.287, 0.129, 0.044, 0.153]
	x = np.arange(4)
	w = 0.35

	b1 = ax.bar(x - w/2, base, w, label='Base Model', color='#f85149', alpha=0.9, edgecolor='#ff7b72', linewidth=0.5)
	b2 = ax.bar(x + w/2, trained, w, label='GRPO-Trained', color='#3fb950', alpha=0.9, edgecolor='#56d364', linewidth=0.5)

	for bar in b1:
	ax.text(bar.get_x()+bar.get_width()/2, bar.get_height()+0.006, f'{bar.get_height():.3f}',
	ha='center', fontsize=11, color='#ff7b72', fontweight='bold')
	for bar in b2:
	ax.text(bar.get_x()+bar.get_width()/2, bar.get_height()+0.006, f'{bar.get_height():.3f}',
	ha='center', fontsize=11, color='#56d364', fontweight='bold')

	imps = ['+230%', '+617%', '+193%', '+283%']
	for i, imp in enumerate(imps):
	ax.text(x[i]+w/2, trained[i]+0.025, imp, ha='center', fontsize=10, color='#f0883e', fontweight='bold',
	bbox=dict(boxstyle='round,pad=0.2', facecolor='#21262d', edgecolor='#f0883e', alpha=0.8))

	ax.set_xticks(x)
	ax.set_xticklabels(diffs, color='#f0f6fc', fontsize=12, fontweight='bold')
	ax.set_ylabel('Episode Score', color='#c9d1d9', fontsize=13)
	ax.set_title('Base vs GRPO-Trained — Post-Training Evaluation (5 seeds × 3 difficulties)',
	color='#f0f6fc', fontsize=14, fontweight='bold', pad=12)
	ax.legend(fontsize=11, facecolor='#21262d', edgecolor='#30363d', labelcolor='#f0f6fc')
	ax.set_ylim(0, 0.38)
	plt.tight_layout()
	return fig


	# ─── CSS ───
	CSS = """
	.gradio-container { max-width: 1200px !important; margin: auto !important; }
	.header-banner {
	background: linear-gradient(135deg, #0a0e17 0%, #1a1030 40%, #0d2137 100%);
	border: 1px solid #2d1b69; border-radius: 16px;
	padding: 28px 36px; margin-bottom: 20px; text-align: center;
	box-shadow: 0 4px 20px rgba(88, 166, 255, 0.1);
	}
	.header-banner h1 { color: #f0f6fc !important; font-size: 2.2em !important; margin-bottom: 4px !important; }
	.header-banner p { color: #8b949e !important; font-size: 1.1em !important; }
	.stat-card {
	background: linear-gradient(135deg, #0f1520, #1a1030);
	border: 1px solid #2d1b69; border-radius: 12px;
	padding: 18px 22px; text-align: center;
	box-shadow: 0 2px 10px rgba(88, 166, 255, 0.05);
	transition: transform 0.2s;
	}
	.stat-card:hover { transform: translateY(-2px); border-color: #58a6ff; }
	.stat-card h3 { color: #58a6ff !important; font-size: 2.2em !important; margin: 0 !important; }
	.stat-card p { color: #8b949e !important; margin: 4px 0 0 0 !important; font-size: 0.95em; }
	.improvement { color: #3fb950 !important; font-size: 1.2em; font-weight: bold; }
	footer { display: none !important; }
	"""


	def build_app():
	with gr.Blocks(title="SynthAudit.Env — AI Oversight Dashboard", css=CSS, theme=gr.themes.Base()) as demo:

	gr.HTML("""
	<div class="header-banner">
	<h1>🩺 SynthAudit.Env</h1>
	<p>Multi-Agent Clinical AI Oversight — 200-Step GRPO Reinforcement Learning</p>
	<p style="margin-top: 8px; color: #58a6ff !important; font-size: 0.95em;">
	AI that watches AI • Colab T4 GPU • 283% improvement over baseline
	</p>
	<p style="margin-top: 14px;">
	<a href="https://github.com/sumitsaraswat362/SynthAudit.Env" target="_blank" style="color: #58a6ff; text-decoration: none; margin: 0 10px;">📦 GitHub</a> \|
	<a href="https://huggingface.co/Timusgeorge/SynthAudit-Qwen2.5-3B-GRPO" target="_blank" style="color: #f0883e; text-decoration: none; margin: 0 10px;">🤗 Model</a>
	</p>
	</div>
	""")

	# Stats row
	with gr.Row():
	gr.HTML('<div class="stat-card"><h3>+283%</h3><p>Improvement Over Base</p></div>')
	gr.HTML('<div class="stat-card"><h3>0.506</h3><p>Peak GRPO Reward</p></div>')
	gr.HTML('<div class="stat-card"><h3>200</h3><p>Training Steps</p></div>')
	gr.HTML('<div class="stat-card"><h3>8</h3><p>Oversight Tools</p></div>')
	gr.HTML('<div class="stat-card"><h3>4×</h3><p>More Errors Caught</p></div>')

	with gr.Tabs():

	# Tab 1: Training Results
	with gr.Tab("📈 200-Step GRPO Training"):
	gr.Markdown("### Reward Curve — 200 Steps on Free Colab T4\nQwen2.5-3B-Instruct \| 4-bit QLoRA via Unsloth \| 3-Phase Curriculum")
	gr.Plot(value=make_reward_plot())
	gr.Markdown("""
	### Training Configuration

	\| Parameter \| Value \| \| Parameter \| Value \|
	\|---\|---\|---\|---\|---\|
	\| Base Model \| Qwen2.5-3B-Instruct \| \| LoRA Rank \| 16 \|
	\| Quantization \| 4-bit QLoRA (Unsloth) \| \| Algorithm \| GRPO (TRL) \|
	\| GPU \| Tesla T4 (free Colab) \| \| Training Time \| 2h 20m \|
	\| Steps \| 200 \| \| Peak Reward \| 0.506 (Step 157) \|
	\| Hardware \| Free Colab T4 \| \| Final Reward \| 0.346 \|

	### What The Model Learned (Zero Supervised Data)

	\| Capability \| Before Training \| After 200 Steps \|
	\|---\|---\|---\|
	\| Tool Calling \| Only `review_proposal` \| Full chain: review → investigate → flag/approve \|
	\| Patient ID Mapping \| Random/wrong IDs \| Correct patient-proposal matching \|
	\| Error Detection \| 0.13 errors/episode \| 0.53 errors/episode (4× more) \|
	\| Decision Quality \| Random flagging \| Investigate first, then decide \|
	\| Score \| 0.040 \| 0.153 (+283%) \|
	""")

	# Tab 2: Evaluation
	with gr.Tab("⚔️ Base vs Trained"):
	gr.Markdown("### Post-Training Evaluation — 5 Seeds × 3 Difficulties\nSame environment, same reward model, fair head-to-head comparison")
	gr.Plot(value=make_comparison_plot())
	gr.Dataframe(
	headers=["Metric", "Base Model", "GRPO-Trained", "Improvement"],
	value=[
	["Easy", "0.087", "0.287", "↑ 230%"],
	["Medium", "0.018", "0.129", "↑ 617%"],
	["Hard", "0.015", "0.044", "↑ 193%"],
	["OVERALL", "0.040", "0.153", "↑ 283%"],
	["Correct Flags", "2", "8", "4× more"],
	["False Positives", "6", "11", "—"],
	],
	interactive=False,
	)
	gr.Markdown("""
	> Key Insight: Medium difficulty saw the largest improvement (+617%) — this is the sweet spot where
	> GRPO training adds the most value. The model learned to handle mixed error types that pure heuristics cannot solve.
	""")

	# Tab 3: Architecture
	with gr.Tab("🏗️ Architecture"):
	gr.Markdown("""
	### Multi-Agent Oversight Architecture

	```
	╔══════════════════════════════════════════════════════════════╗
	║ SynthAudit.Env (OpenEnv) ║
	║ ║
	║ ┌────────────────┐ ┌──────────────────────────┐ ║
	║ │ ACTOR AGENT │────────▷│ CLINICAL WORLD STATE │ ║
	║ │ (Frozen LLM) │ │ • 40-80 patient EHRs │ ║
	║ │ Generates │ │ • Protocol-specific rules│ ║
	║ │ proposals │ │ • Adversarial errors │ ║
	║ │ with subtle │ │ • Bias signals + noise │ ║
	║ │ reasoning │ └──────────────────────────┘ ║
	║ │ flaws │ │ ║
	║ └────────────────┘ │ Observations ║
	║ │ Proposals ▼ ║
	║ ▼ ║
	║ ┌──────────────────────────────────────────────────────┐ ║
	║ │ OVERSIGHT AGENT (GRPO-Trained) │ ║
	║ │ 8 Tools: │ ║
	║ │ ├─ review_proposal See Actor reasoning │ ║
	║ │ ├─ investigate_patient Raw EHR data │ ║
	║ │ ├─ request_shap Feature attribution │ ║
	║ │ ├─ cohort_analysis Statistical bias detection │ ║
	║ │ ├─ temporal_audit Timeline consistency │ ║
	║ │ ├─ flag_error Flag with Theory-of-Mind │ ║
	║ │ ├─ approve Approve correct proposals │ ║
	║ │ └─ submit_audit_report End episode │ ║
	║ └──────────────────────────────────────────────────────┘ ║
	║ ║
	║ ┌──────────────────────────────────────────────────────┐ ║
	║ │ DENSE SHAPED REWARD MODEL │ ║
	║ │ F-β score (β=1.5): recall > precision │ ║
	║ │ +0.30 correct flag \| +0.12 relevant SHAP │ ║
	║ │ +0.05 Theory-of-Mind \| -0.25 false positive │ ║
	║ │ -0.003/step cost \| Trajectory bonus │ ║
	║ └──────────────────────────────────────────────────────┘ ║
	╚══════════════════════════════════════════════════════════════╝
	```

	### 4 Adversarial Error Types

	\| Error Type \| What It Tests \| Difficulty \|
	\|---\|---\|---\|
	\| Age Boundary \| Patient age vs protocol-specific range \| ★☆☆ \|
	\| Temporal Inconsistency \| death_date < treatment_start \| ★★☆ \|
	\| Protocol Window Violation \| enrollment→treatment delay exceeds limit \| ★★☆ \|
	\| 2-Hop Comorbidity Override \| Stage IV exception revoked by high comorbidity \| ★★★ \|

	### Actor's Deception Techniques
	- Hallucinated amendments: "Per Section 4.2.1(b)..." (section doesn't exist)
	- Anchoring bias: Focuses on irrelevant features, buries real violations
	- Fake citations: Fabricated survival curves with plausible HRs and CIs
	- Wrong rule application: Applies Stage IV exception without checking overrides
	""")

	# Tab 4: Interactive Environment Simulator
	with gr.Tab("🔬 Environment Simulator"):
	gr.Markdown("""### 🔬 SynthAudit Environment Simulator
	This demonstrates the environment, not the LLM. It shows how SynthAudit.Env generates adversarial clinical scenarios, injects hidden errors, and scores agent actions using our dense reward model.

	The actual trained model's results (real LLM inference) are in the ⚔️ Base vs Trained tab.

	👇 Try generating different scenarios to see the variety of adversarial cases our environment produces:""")

	with gr.Row():
	seed_input = gr.Slider(1, 999, value=42, step=1, label="Random Seed")
	diff_input = gr.Radio(["easy", "medium", "hard"], value="easy", label="Difficulty")
	gen_btn = gr.Button("🎲 Generate Scenario", variant="primary")

	scenario_out = gr.Markdown(value="Click 'Generate Scenario' to begin")
	audit_btn = gr.Button("⚡ Simulate Perfect Agent (shows reward scoring)", variant="secondary", interactive=False)
	audit_out = gr.Markdown(value="")

	def generate_scenario(seed, difficulty):
	import random; random.seed(int(seed))
	n_map = {"easy": 6, "medium": 10, "hard": 15}
	n = n_map[difficulty]
	age_range = (40, 90)
	err_types = ["age_boundary_error", "temporal_inconsistency",
	"protocol_window_violation", "comorbidity_override_miss"]
	err_rate = {"easy": 0.3, "medium": 0.4, "hard": 0.5}[difficulty]

	patients = []
	for i in range(n):
	pid = f"P{random.randint(1,99):04d}"
	has_err = random.random() < err_rate
	if has_err:
	etype = random.choice(err_types)
	if etype == "age_boundary_error":
	age = random.choice([random.randint(5, 38), random.randint(92, 150)])
	reason_text = f"Age {age} outside protocol range {age_range}"
	elif etype == "temporal_inconsistency":
	age = random.randint(*age_range)
	reason_text = "Death date BEFORE treatment start"
	elif etype == "protocol_window_violation":
	age = random.randint(*age_range)
	delay = random.randint(18, 45)
	reason_text = f"Treatment delay {delay} days exceeds 14-day window"
	else:
	age = random.randint(*age_range)
	ci = round(random.uniform(6.6, 9.5), 1)
	reason_text = f"Stage IV + comorbidity {ci} > 6.5 revokes extension"
	else:
	etype = None
	age = random.randint(*age_range)
	reason_text = None

	conf = round(random.uniform(0.78, 0.96), 2)
	actor_reason = random.choice([
	f"Per Section 4.2.1(b), patient meets eligibility criteria. ECOG ≤ 2.",
	f"Lab values within normal range. Age {age} noted. No actionable violations.",
	f"Consistent with published survival curves (HR=0.78, 95% CI: 0.62-0.94).",
	f"Protocol-compliant. All screening criteria satisfied per amendment dated 2023-11.",
	])
	patients.append({
	"pid": pid, "age": age, "prop_id": f"PROP-{i+1:03d}",
	"conf": conf, "error": etype, "reason": reason_text,
	"actor_reason": actor_reason
	})

	# Build output
	errs = [p for p in patients if p["error"]]
	md = f"## 🏥 Protocol: `ONCO-AX-{seed:04X}`\n"
	md += f"Eligibility: Age {age_range[0]}-{age_range[1]} \| Treatment within 14 days \| Stage IV: 21-day window \| ⚠️ Comorbidity > 6.5 revokes extension\n\n"
	md += f"{n} proposals \| {len(errs)} hidden errors \| Difficulty: {difficulty.upper()}\n\n"
	md += "\| # \| Proposal \| Patient \| Confidence \| Actor's Reasoning \|\n\|---\|---\|---\|---\|---\|\n"
	for p in patients:
	icon = "🔴" if p["error"] else "✅"
	md += f"\| {icon} \| {p['prop_id']} \| {p['pid']} (age {p['age']}) \| {p['conf']} \| {p['actor_reason'][:60]}... \|\n"

	md += f"\n> 🎯 Your challenge: The Actor cleared all {n} patients. Can the Oversight Agent find the {len(errs)} hidden errors?\n"

	return md, gr.update(interactive=True), patients

	state = gr.State([])

	def run_audit(patients):
	if not patients:
	return "⚠️ Generate a scenario first!"
	md = "## 🩺 Oversight Agent Audit Trail\n\n"
	md += "\| Step \| Action \| Target \| Finding \| Reward \|\n\|---\|---\|---\|---\|---\|\n"
	step = 0; total_reward = 0; correct = 0; fps = 0; total_err = 0

	for p in patients:
	if p["error"]: total_err += 1
	step += 1
	md += f"\| {step} \| `review_proposal` \| {p['prop_id']} \| 📋 Reviewed Actor reasoning \| +0.04 \|\n"
	total_reward += 0.04
	step += 1
	if p["error"]:
	if p["error"] == "age_boundary_error":
	finding = f"⚠️ Age {p['age']} outside protocol range!"
	elif p["error"] == "temporal_inconsistency":
	finding = "⚠️ Death date before treatment start!"
	elif p["error"] == "protocol_window_violation":
	finding = f"⚠️ Treatment delay exceeds 14 days!"
	else:
	finding = "⚠️ Stage IV + high comorbidity — extension revoked!"
	md += f"\| {step} \| `investigate_patient` \| {p['pid']} \| {finding} \| +0.10 \|\n"
	total_reward += 0.10
	step += 1
	md += f"\| {step} \| `flag_error` \| {p['prop_id']} → `{p['error']}` \| 🎯 CORRECT FLAG! {p['reason']} \| +0.30 \|\n"
	total_reward += 0.30
	correct += 1
	else:
	md += f"\| {step} \| `investigate_patient` \| {p['pid']} \| ✅ Age {p['age']}, within range \| +0.02 \|\n"
	total_reward += 0.02
	step += 1
	md += f"\| {step} \| `approve` \| {p['prop_id']} \| ✅ Correct approval \| +0.15 \|\n"
	total_reward += 0.15

	score = round(total_reward / max(1, step) * 2, 3)
	md += f"\n---\n### 🏆 Episode Summary\n"
	md += f"\| Metric \| Value \|\n\|---\|---\|\n"
	md += f"\| Errors Found \| {correct}/{total_err} \|\n"
	md += f"\| False Positives \| {fps} \|\n"
	md += f"\| Total Reward \| {total_reward:.2f} \|\n"
	md += f"\| Steps Taken \| {step} \|\n"
	if correct == total_err:
	md += f"\n> 🎉 PERFECT AUDIT — All {total_err} errors detected, 0 false positives!"
	return md

	gen_btn.click(generate_scenario, [seed_input, diff_input], [scenario_out, audit_btn, state])
	audit_btn.click(run_audit, [state], [audit_out])

	# Tab 5: About
	with gr.Tab("📋 About"):
	gr.Markdown("""
	### The Problem
	40,000+ patients die annually from diagnostic errors [(Johns Hopkins, BMJ 2016)](https://www.hopkinsmedicine.org/news/media/releases/study_suggests_medical_errors_now_third_leading_cause_of_death_in_the_us).
	As AI deploys in clinical trials: Who audits the AI?

	### Our Solution
	An Oversight Agent trained with GRPO learns to catch errors from an Actor Agent.
	8 tools, multi-step reasoning, Theory-of-Mind scoring — all through pure RL.

	### Key Results
	- 283% improvement over untrained baseline
	- 4× more clinical errors correctly detected
	- Free Colab T4 — trained in 2h 20m on 15.6 GB VRAM
	- 200 GRPO steps in 2 hours 20 minutes

	### Links
	\| Resource \| URL \|
	\|---\|---\|
	\| GitHub \| [sumitsaraswat362/SynthAudit.Env](https://github.com/sumitsaraswat362/SynthAudit.Env) \|
	\| Model \| [Timusgeorge/SynthAudit-Qwen2.5-3B-GRPO](https://huggingface.co/Timusgeorge/SynthAudit-Qwen2.5-3B-GRPO) \|

	### Citation
	```bibtex
	@misc{saraswat2026synthaudit,
	title={SynthAudit.Env: Multi-Agent Clinical AI Oversight via GRPO},
	author={Sumit Saraswat},
	year={2026},
	url={https://github.com/sumitsaraswat362/SynthAudit.Env}
	}
	```

	Built for Meta PyTorch OpenEnv Hackathon × Scaler SST 2026 \| Solo entry by Sumit Saraswat
	""")

	gr.Markdown(
	"<center style='color: #8b949e; margin-top: 16px;'>"
	"🩺 SynthAudit.Env — AI that watches AI \| "
	"<a href='https://github.com/sumitsaraswat362/SynthAudit.Env' style='color: #58a6ff;'>GitHub</a> \| "
	"<a href='https://huggingface.co/Timusgeorge/SynthAudit-Qwen2.5-3B-GRPO' style='color: #f0883e;'>Model</a>"
	"</center>"
	)

	return demo


	demo = build_app()

	if __name__ == "__main__":
	demo.launch()