"""Tests for the strict-submission mode of ``scripts.acceptance_gate.run_checks``.

Each test builds a throw-away project tree under pytest's ``tmp_path`` and
then inspects the summary dict returned by ``run_checks``.
"""

from __future__ import annotations

import json
from pathlib import Path

from scripts.acceptance_gate import run_checks

# Placeholder source/doc files created (with dummy content) by
# ``_minimal_project``.
_PROJECT_FILES = (
    "openenv.yaml",
    "__init__.py",
    "client.py",
    "models.py",
    "server/__init__.py",
    "server/app.py",
    "app/env/env_core.py",
    "app/env/fastapi_app.py",
    "app/env/client.py",
    "app/agents/orchestrator.py",
    "app/training/grpo_trl.py",
    "app/hf_space/training_runner.py",
    "scripts/deploy_training_space.py",
    "scripts/pull_training_artifacts.py",
    "scripts/generate_hf_training_report.py",
    "scripts/train_sft_trl.py",
    "scripts/train_grpo_trl.py",
    "scripts/evaluate_policy_ablations.py",
    "scripts/merge_adapters_safe.py",
    "scripts/test_inference_postsave.py",
    "scripts/deploy_space.sh",
    "scripts/bootstrap_openenv.sh",
    "docs/training.md",
    "docs/deployment.md",
    "docs/evaluation.md",
    "docs/submission_checklist.md",
)

# Placeholder data/report files created (with dummy content) by
# ``_minimal_project``.
_DATA_FILES = (
    "data/processed/normalized_drugs.parquet",
    "data/processed/drug_classes.parquet",
    "data/processed/interactions.parquet",
    "data/processed/burden_rules.yaml",
    "data/processed/taper_rules.yaml",
    "data/processed/substitution_rules.yaml",
    "data/processed/retrieval_corpus.jsonl",
    "data/processed/graph_edges.parquet",
    "data/processed/patients_synthetic.parquet",
    "data/processed/provenance_manifest.json",
    "data/processed/feature_dictionary.json",
    "data/scenarios/scenarios_easy.jsonl",
    "data/scenarios/scenarios_medium.jsonl",
    "data/scenarios/scenarios_hard.jsonl",
    "outputs/reports/benchmark_report.json",
    "outputs/reports/baselines.json",
)

# Plot files written by the passing scenario.
_PLOT_FILES = (
    "outputs/plots/sft_vs_grpo_reward.png",
    "outputs/plots/sft_loss_curves.png",
    "outputs/plots/qwen_model_sft_reward.png",
    "outputs/plots/qwen_model_sft_loss.png",
    "outputs/plots/sft_validity_reward.png",
    "outputs/plots/grpo_reward_curves.png",
    "outputs/plots/qwen_model_grpo_reward.png",
    "outputs/plots/reward_component_bars.png",
    "outputs/plots/anti_cheat_failure_rates.png",
    "outputs/plots/train_holdout_gap.png",
    "outputs/plots/inference_validity_reward.png",
    "outputs/plots/inference_latency_validity.png",
)

# Section headings shared by both README fixtures.
_README_HEADINGS = (
    "# PolyGuard",
    "## Problem Statement",
    "## Environment",
    "## Capabilities",
    "## Tasks",
    "## Reward Model / Evaluation Logic",
    "## Post-Training Strategy",
)


def _write(path: Path, payload: str) -> None:
    """Write *payload* to *path* as UTF-8, creating parent directories first."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload, encoding="utf-8")


def _json(path: Path, payload: dict) -> None:
    """Serialize *payload* as indented, ASCII-only JSON and write it to *path*."""
    _write(path, json.dumps(payload, ensure_ascii=True, indent=2))


def _minimal_project(root: Path) -> None:
    """Populate *root* with dummy-content files for every listed project path."""
    for rel in (*_PROJECT_FILES, *_DATA_FILES):
        _write(root / rel, "x\n")


def _write_readme(root: Path, links: dict[str, str]) -> None:
    """Write ``README.md`` under *root*: the shared headings plus link bullets.

    Each heading and each ``- <label>: <url>`` bullet goes on its own line,
    and the text starts and ends with a newline.
    """
    bullets = [f"- {label}: {url}" for label, url in links.items()]
    _write(root / "README.md", "\n".join(["", *_README_HEADINGS, *bullets, ""]))


def test_strict_acceptance_gate_flags_submission_blockers(tmp_path: Path) -> None:
    """Strict mode fails and lists every blocker for a placeholder/fallback project."""
    _minimal_project(tmp_path)
    _write_readme(
        tmp_path,
        {
            "GitHub Repo URL": "https://github.com/your-username/polyguard-openenv",
            "HF Space URL": "https://huggingface.co/spaces/your-username/polyguard-openenv",
            "Colab Notebook URL": "https://colab.research.google.com/drive/your-colab-id",
            "YouTube Video URL": "https://www.youtube.com/watch?v=your-video-id",
            "Hugging Face Blog URL": "https://huggingface.co/blog/your-polyguard-post",
        },
    )
    reports = tmp_path / "outputs/reports"
    _json(reports / "sft_trl_run.json", {"backend": "fallback_sklearn"})
    _json(
        reports / "grpo_trl_run.json",
        {"status": "fallback", "backend": "env_reward_fallback", "artifact_path": ""},
    )
    _json(reports / "postsave_inference.json", {"model_source": "fallback_policy"})
    _json(reports / "improvement_report.json", {"improved": False})

    summary = run_checks(root=tmp_path, strict_submission_links=True)

    assert summary["status"] == "fail"
    assert summary["submission_ready"] is False

    failures = summary["strict_submission_failures"]
    expected_failures = (
        "README placeholder links present",
        "SFT report status is not ok",
        "SFT report uses fallback backend",
        "SFT artifact path is empty or missing",
        "SFT report has no training examples",
        "GRPO report status is not ok",
        "GRPO artifact path is empty or missing",
        "post-save inference uses fallback policy",
        "improvement report is not positive",
        "tracked result assets missing",
        "HF deployment verification missing",
        "HF training sweep summary missing",
        "anti-hacking/overfit report is not passing",
        "HF sweep charts missing",
    )
    # Collect every absent message so a single run shows the full gap
    # instead of stopping at the first one.
    missing = [message for message in expected_failures if message not in failures]
    assert not missing, f"expected strict failures not reported: {missing}"


def test_strict_acceptance_gate_passes_when_submission_evidence_exists(tmp_path: Path) -> None:
    """Strict mode passes with no failures when all submission evidence is present."""
    _minimal_project(tmp_path)
    _write_readme(
        tmp_path,
        {
            "GitHub Repo URL": "https://github.com/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK",
            "HF Space URL": "https://huggingface.co/spaces/vishwa-docs/polyguard-openenv",
            "Colab Notebook URL": "https://colab.research.google.com/drive/real-polyguard-colab",
            "YouTube Video URL": "https://www.youtube.com/watch?v=realvide01",
            "Hugging Face Blog URL": "https://huggingface.co/blog/vishwa-docs/polyguard-openenv",
        },
    )
    reports = tmp_path / "outputs/reports"
    _json(
        reports / "sft_trl_run.json",
        {
            "status": "ok",
            "backend": "trl_transformers",
            "examples_used": 32,
            "artifact_path": "checkpoints/sft_adapter",
        },
    )
    _json(
        reports / "grpo_trl_run.json",
        {
            "status": "ok",
            "backend": "trl_transformers",
            "artifact_path": "checkpoints/grpo_adapter",
        },
    )
    _json(reports / "postsave_inference.json", {"model_source": "sft_adapter"})
    _json(reports / "improvement_report.json", {"improved": True})
    _json(
        reports / "hf_sweep_summary.json",
        {
            "completed_models": 1,
            "models": [
                {
                    "status": "completed",
                    "label": "Qwen2.5-0.5B",
                    "fallback_detected": False,
                    "reward_range_ok": True,
                    "artifact_paths": {
                        "sft": "checkpoints/sweeps/qwen/sft_adapter",
                        "grpo": "checkpoints/sweeps/qwen/grpo_adapter",
                    },
                }
            ],
        },
    )
    _json(reports / "anti_hacking_overfit_report.json", {"passed": True})

    results = tmp_path / "docs/results"
    _json(results / "hf_space_verification.json", {"passed": True})
    _write(results / "avg_reward.png", "png\n")
    _write(results / "policy_stack_avg_reward.png", "png\n")
    for rel in _PLOT_FILES:
        _write(tmp_path / rel, "png\n")

    summary = run_checks(root=tmp_path, strict_submission_links=True)

    assert summary["status"] == "ok"
    assert summary["submission_ready"] is True
    assert summary["strict_submission_failures"] == []