# polyguard-openenv-workbench / polyguard-rl / tests / test_acceptance_gate.py
# (Deployed from GitHub root master to Space; commit c296d62, author TheJackBright.)
from __future__ import annotations
import json
from pathlib import Path
from scripts.acceptance_gate import run_checks
def _write(path: Path, payload: str) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(payload, encoding="utf-8")
def _json(path: Path, payload: dict) -> None:
    """Serialize *payload* as pretty-printed ASCII JSON and write it to *path*."""
    text = json.dumps(payload, ensure_ascii=True, indent=2)
    _write(path, text)
def _minimal_project(root: Path) -> None:
    """Lay down the minimal file tree the acceptance gate expects under *root*.

    Every file gets placeholder content ("x\\n"); only existence is checked
    by the gate for these paths.
    """
    # Code, scripts and docs the gate requires to be present.
    code_paths = [
        "openenv.yaml",
        "__init__.py",
        "client.py",
        "models.py",
        "server/__init__.py",
        "server/app.py",
        "app/env/env_core.py",
        "app/env/fastapi_app.py",
        "app/env/client.py",
        "app/agents/orchestrator.py",
        "app/training/grpo_trl.py",
        "app/hf_space/training_runner.py",
        "scripts/deploy_training_space.py",
        "scripts/pull_training_artifacts.py",
        "scripts/generate_hf_training_report.py",
        "scripts/train_sft_trl.py",
        "scripts/train_grpo_trl.py",
        "scripts/evaluate_policy_ablations.py",
        "scripts/merge_adapters_safe.py",
        "scripts/test_inference_postsave.py",
        "scripts/deploy_space.sh",
        "scripts/bootstrap_openenv.sh",
        "docs/training.md",
        "docs/deployment.md",
        "docs/evaluation.md",
        "docs/submission_checklist.md",
    ]
    # Processed data, scenario and report artifacts the gate also requires.
    data_paths = [
        "data/processed/normalized_drugs.parquet",
        "data/processed/drug_classes.parquet",
        "data/processed/interactions.parquet",
        "data/processed/burden_rules.yaml",
        "data/processed/taper_rules.yaml",
        "data/processed/substitution_rules.yaml",
        "data/processed/retrieval_corpus.jsonl",
        "data/processed/graph_edges.parquet",
        "data/processed/patients_synthetic.parquet",
        "data/processed/provenance_manifest.json",
        "data/processed/feature_dictionary.json",
        "data/scenarios/scenarios_easy.jsonl",
        "data/scenarios/scenarios_medium.jsonl",
        "data/scenarios/scenarios_hard.jsonl",
        "outputs/reports/benchmark_report.json",
        "outputs/reports/baselines.json",
    ]
    for rel in code_paths + data_paths:
        _write(root / rel, "x\n")
def test_strict_acceptance_gate_flags_submission_blockers(tmp_path: Path) -> None:
    """Strict gate must fail and name every blocker when evidence is placeholder/fallback."""
    _minimal_project(tmp_path)
    # README with template/placeholder submission links.
    readme = """
# PolyGuard
## Problem Statement
## Environment
## Capabilities
## Tasks
## Reward Model / Evaluation Logic
## Post-Training Strategy
- GitHub Repo URL: https://github.com/your-username/polyguard-openenv
- HF Space URL: https://huggingface.co/spaces/your-username/polyguard-openenv
- Colab Notebook URL: https://colab.research.google.com/drive/your-colab-id
- YouTube Video URL: https://www.youtube.com/watch?v=your-video-id
- Hugging Face Blog URL: https://huggingface.co/blog/your-polyguard-post
"""
    _write(tmp_path / "README.md", readme)
    # Training/inference reports that all look like fallbacks, not real runs.
    _json(tmp_path / "outputs/reports/sft_trl_run.json", {"backend": "fallback_sklearn"})
    _json(
        tmp_path / "outputs/reports/grpo_trl_run.json",
        {"status": "fallback", "backend": "env_reward_fallback", "artifact_path": ""},
    )
    _json(tmp_path / "outputs/reports/postsave_inference.json", {"model_source": "fallback_policy"})
    _json(tmp_path / "outputs/reports/improvement_report.json", {"improved": False})

    summary = run_checks(root=tmp_path, strict_submission_links=True)

    assert summary["status"] == "fail"
    assert summary["submission_ready"] is False
    expected_failures = [
        "README placeholder links present",
        "SFT report status is not ok",
        "SFT report uses fallback backend",
        "SFT artifact path is empty or missing",
        "SFT report has no training examples",
        "GRPO report status is not ok",
        "GRPO artifact path is empty or missing",
        "post-save inference uses fallback policy",
        "improvement report is not positive",
        "tracked result assets missing",
        "HF deployment verification missing",
        "HF training sweep summary missing",
        "anti-hacking/overfit report is not passing",
        "HF sweep charts missing",
    ]
    for message in expected_failures:
        assert message in summary["strict_submission_failures"]
def test_strict_acceptance_gate_passes_when_submission_evidence_exists(tmp_path: Path) -> None:
    """Strict gate must pass when real links, reports, sweeps and charts all exist."""
    _minimal_project(tmp_path)
    # README with concrete (non-placeholder) submission links.
    readme = """
# PolyGuard
## Problem Statement
## Environment
## Capabilities
## Tasks
## Reward Model / Evaluation Logic
## Post-Training Strategy
- GitHub Repo URL: https://github.com/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK
- HF Space URL: https://huggingface.co/spaces/vishwa-docs/polyguard-openenv
- Colab Notebook URL: https://colab.research.google.com/drive/real-polyguard-colab
- YouTube Video URL: https://www.youtube.com/watch?v=realvide01
- Hugging Face Blog URL: https://huggingface.co/blog/vishwa-docs/polyguard-openenv
"""
    _write(tmp_path / "README.md", readme)
    # Healthy SFT/GRPO runs backed by real TRL artifacts.
    sft_report = {
        "status": "ok",
        "backend": "trl_transformers",
        "examples_used": 32,
        "artifact_path": "checkpoints/sft_adapter",
    }
    _json(tmp_path / "outputs/reports/sft_trl_run.json", sft_report)
    grpo_report = {"status": "ok", "backend": "trl_transformers", "artifact_path": "checkpoints/grpo_adapter"}
    _json(tmp_path / "outputs/reports/grpo_trl_run.json", grpo_report)
    _json(tmp_path / "outputs/reports/postsave_inference.json", {"model_source": "sft_adapter"})
    _json(tmp_path / "outputs/reports/improvement_report.json", {"improved": True})
    # One completed sweep model with sane rewards and both adapters saved.
    sweep_summary = {
        "completed_models": 1,
        "models": [
            {
                "status": "completed",
                "label": "Qwen2.5-0.5B",
                "fallback_detected": False,
                "reward_range_ok": True,
                "artifact_paths": {
                    "sft": "checkpoints/sweeps/qwen/sft_adapter",
                    "grpo": "checkpoints/sweeps/qwen/grpo_adapter",
                },
            }
        ],
    }
    _json(tmp_path / "outputs/reports/hf_sweep_summary.json", sweep_summary)
    _json(tmp_path / "outputs/reports/anti_hacking_overfit_report.json", {"passed": True})
    _json(tmp_path / "docs/results/hf_space_verification.json", {"passed": True})
    # Tracked result assets plus every required chart (placeholder PNG bytes).
    chart_paths = [
        "docs/results/avg_reward.png",
        "docs/results/policy_stack_avg_reward.png",
        "outputs/plots/sft_vs_grpo_reward.png",
        "outputs/plots/sft_loss_curves.png",
        "outputs/plots/qwen_model_sft_reward.png",
        "outputs/plots/qwen_model_sft_loss.png",
        "outputs/plots/sft_validity_reward.png",
        "outputs/plots/grpo_reward_curves.png",
        "outputs/plots/qwen_model_grpo_reward.png",
        "outputs/plots/reward_component_bars.png",
        "outputs/plots/anti_cheat_failure_rates.png",
        "outputs/plots/train_holdout_gap.png",
        "outputs/plots/inference_validity_reward.png",
        "outputs/plots/inference_latency_validity.png",
    ]
    for rel in chart_paths:
        _write(tmp_path / rel, "png\n")

    summary = run_checks(root=tmp_path, strict_submission_links=True)

    assert summary["status"] == "ok"
    assert summary["submission_ready"] is True
    assert summary["strict_submission_failures"] == []