#!/usr/bin/env python3
"""Package the currently active PolyGuard model artifacts for implementation use."""

from __future__ import annotations

import argparse
from collections.abc import Sequence
from datetime import datetime, timezone
import json
import os
from pathlib import Path
import shutil
import zipfile
from typing import Any

try:
    from huggingface_hub import HfApi
except ImportError:  # Only ``--upload`` needs the hub client; packaging works without it.
    HfApi = None


ROOT = Path(__file__).resolve().parents[1]
DEFAULT_BUNDLE_NAME = "local-qwen-0-5b-active-smoke"
DEFAULT_ARTIFACT_REPO = "TheJackBright/polyguard-openenv-training-full-artifacts"


def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The parsed namespace with ``bundle_name``, ``artifact_repo_id``,
        ``output_root``, ``upload``, ``no_merged`` and ``zip`` attributes.
    """
    parser = argparse.ArgumentParser(description="Package active PolyGuard model artifacts.")
    parser.add_argument("--bundle-name", default=DEFAULT_BUNDLE_NAME)
    parser.add_argument("--artifact-repo-id", default=DEFAULT_ARTIFACT_REPO)
    parser.add_argument("--output-root", default=str(ROOT / "submission_bundle" / "model_artifacts"))
    parser.add_argument("--upload", action="store_true")
    parser.add_argument("--no-merged", action="store_true", help="Skip the merged model directory.")
    # ``--zip`` stays accepted (and stays the default); ``--no-zip`` is the
    # only way to switch the archive off, which was impossible before.
    parser.add_argument("--zip", dest="zip", action="store_true", default=True, help="Create a zip archive (default).")
    parser.add_argument("--no-zip", dest="zip", action="store_false", help="Do not create a zip archive.")
    return parser.parse_args(argv)


def load_json(path: Path) -> dict[str, Any]:
    """Best-effort read of a JSON object from *path*.

    Returns an empty dict when the file is missing, is not valid UTF-8 JSON,
    or its top-level value is not a JSON object.
    """
    if not path.exists():
        return {}
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError):
        return {}
    return payload if isinstance(payload, dict) else {}


def copy_tree(source: Path, target: Path) -> dict[str, Any]:
    """Recursively copy *source* into *target* and summarise the result.

    Returns:
        A record with ``source``, ``target``, ``exists``, ``file_count`` and
        ``bytes``. When *source* does not exist nothing is copied and the
        counters are zero. The counters are measured on *target* after the
        copy, so files already present there are counted as well.
    """
    if not source.exists():
        return {"source": str(source), "target": str(target), "exists": False, "file_count": 0, "bytes": 0}
    shutil.copytree(source, target, dirs_exist_ok=True, symlinks=False)
    sizes = [entry.stat().st_size for entry in target.rglob("*") if entry.is_file()]
    return {
        "source": str(source),
        "target": str(target),
        "exists": True,
        "file_count": len(sizes),
        "bytes": sum(sizes),
    }


def copy_file(source: Path, target: Path) -> dict[str, Any]:
    """Copy a single file (with metadata), creating parent directories of *target*.

    Returns:
        A record with ``source``, ``target``, ``exists`` and ``bytes``; when
        *source* is missing nothing is copied and ``bytes`` is zero.
    """
    if not source.exists():
        return {"source": str(source), "target": str(target), "exists": False, "bytes": 0}
    target.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(source, target)
    return {"source": str(source), "target": str(target), "exists": True, "bytes": target.stat().st_size}


def write_json(path: Path, payload: Any) -> None:
    """Write *payload* to *path* as indented ASCII JSON with a trailing newline."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, ensure_ascii=True, indent=2) + "\n", encoding="utf-8")


def write_readme(
    bundle_dir: Path,
    manifest: dict[str, Any],
    artifact_repo_id: str = DEFAULT_ARTIFACT_REPO,
) -> None:
    """Write ``README.md`` into *bundle_dir* describing the bundle.

    Args:
        bundle_dir: Existing bundle directory that receives the README.
        manifest: Bundle manifest; ``bundle_name`` is required, ``model_id``,
            ``base_model``, ``preferred_artifact`` and ``copies`` are optional.
        artifact_repo_id: Hugging Face repo shown in the download snippet.
    """
    bundle_name = manifest["bundle_name"]
    # The restore commands assume the default output root relative to the repo root.
    bundle_path = f"submission_bundle/model_artifacts/{bundle_name}"

    copies = manifest.get("copies")
    if isinstance(copies, dict):
        merged_available = bool(copies.get("merged", {}).get("exists"))
    else:
        # No copy record available: keep the historical README content.
        merged_available = True

    restore_commands = [
        # NOTE(review): hard-coded developer checkout path; readers must adapt it.
        "cd /Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl",
        f"cp -R {bundle_path}/checkpoints/grpo_adapter checkpoints/grpo_adapter",
        f"cp -R {bundle_path}/checkpoints/sft_adapter checkpoints/sft_adapter",
    ]
    if merged_available:
        restore_commands.append(f"cp -R {bundle_path}/checkpoints/merged checkpoints/merged")
    restore_commands.extend(
        [
            "mkdir -p checkpoints/active",
            f"cp {bundle_path}/manifests/active_model_manifest.json checkpoints/active/active_model_manifest.json",
            "curl http://127.0.0.1:8200/policy/model_status",
        ]
    )

    lines = [
        "# PolyGuard Active Model Artifact Bundle",
        "",
        f"Bundle: `{bundle_name}`",
        f"Model: `{manifest.get('model_id', '')}`",
        f"Base model: `{manifest.get('base_model', '')}`",
        f"Preferred artifact: `{manifest.get('preferred_artifact', '')}`",
        "",
        "This bundle is meant for implementation/testing while the full per-model remote sweep artifacts are still uploading.",
        "",
        "## Contents",
        "",
        "- `checkpoints/grpo_adapter/`",
        "- `checkpoints/sft_adapter/`",
        "- `checkpoints/merged/` when included",
        "- `manifests/active_model_manifest.json`",
        "- `reports/`",
        "",
        "## Restore Locally",
        "",
        "```bash",
        *restore_commands,
        "```",
        "",
        "## Hugging Face Download",
        "",
        "After upload, download with:",
        "",
        "```bash",
        "export HF_TOKEN=\"$(cat ~/.cache/huggingface/token)\"",
        f"huggingface-cli download {artifact_repo_id} \\",
        "  --repo-type model \\",
        f"  --include 'usable_model_bundles/{bundle_name}/**' \\",
        "  --local-dir ./hf_artifacts",
        "```",
        "",
        "Note: this is the current local active Qwen 0.5B implementation bundle. It is not the final full remote Qwen 0.5B/1.5B sweep checkpoint until those files appear in the HF artifact repo.",
        "",
    ]
    bundle_dir.joinpath("README.md").write_text("\n".join(lines), encoding="utf-8")


def zip_bundle(bundle_dir: Path) -> Path:
    """Create ``<bundle_dir>.zip`` next to *bundle_dir* and return its path.

    Archive members are stored relative to the bundle's parent directory, so
    the archive unpacks into a single top-level folder named after the bundle.
    ``.DS_Store`` files are skipped and an existing archive is replaced.
    """
    # ``with_suffix`` would chop everything after the last dot of a name such
    # as ``local-qwen-0.5b``; append the extension to the full name instead.
    zip_path = bundle_dir.with_name(bundle_dir.name + ".zip")
    if zip_path.exists():
        zip_path.unlink()
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for path in bundle_dir.rglob("*"):
            if path.is_file() and path.name != ".DS_Store":
                archive.write(path, arcname=str(path.relative_to(bundle_dir.parent)))
    return zip_path


def package_bundle(
    *,
    bundle_name: str,
    output_root: Path,
    include_merged: bool,
    artifact_repo_id: str = DEFAULT_ARTIFACT_REPO,
) -> dict[str, Any]:
    """Assemble the bundle directory and return its manifest.

    Any existing ``output_root / bundle_name`` directory is deleted first.
    Adapters, manifests and reports are copied from under ``ROOT``; missing
    sources are recorded with ``exists: False`` instead of raising. The
    manifest is written to ``bundle_manifest.json`` alongside a ``README.md``.

    Args:
        bundle_name: Name of the bundle directory under *output_root*.
        output_root: Directory that holds the bundle.
        include_merged: Whether to copy ``checkpoints/merged`` as well.
        artifact_repo_id: Hugging Face repo referenced in the README.

    Returns:
        The manifest dict that was written to disk.
    """
    checkpoints = ROOT / "checkpoints"
    reports = ROOT / "outputs" / "reports"
    active_manifest_path = checkpoints / "active" / "active_model_manifest.json"
    active_manifest = load_json(active_manifest_path)

    bundle_dir = output_root / bundle_name
    if bundle_dir.exists():
        shutil.rmtree(bundle_dir)
    bundle_dir.mkdir(parents=True, exist_ok=True)

    copies: dict[str, Any] = {
        "grpo_adapter": copy_tree(checkpoints / "grpo_adapter", bundle_dir / "checkpoints" / "grpo_adapter"),
        "sft_adapter": copy_tree(checkpoints / "sft_adapter", bundle_dir / "checkpoints" / "sft_adapter"),
        "active_manifest": copy_file(
            active_manifest_path,
            bundle_dir / "manifests" / "active_model_manifest.json",
        ),
        "active_report_manifest": copy_file(
            reports / "active_model" / "active_model_manifest.json",
            bundle_dir / "manifests" / "active_model_report_manifest.json",
        ),
        "submission_evidence_manifest": copy_file(
            reports / "submission_evidence" / "qwen_0_5b_1_5b" / "manifest.json",
            bundle_dir / "manifests" / "submission_evidence_manifest.json",
        ),
        "reports": copy_tree(reports / "active_model", bundle_dir / "reports"),
    }
    if include_merged:
        copies["merged"] = copy_tree(checkpoints / "merged", bundle_dir / "checkpoints" / "merged")

    manifest = {
        "status": "ok",
        "bundle_name": bundle_name,
        "created_at_utc": datetime.now(timezone.utc).isoformat(),
        "source": "local_active_model",
        "run_id": active_manifest.get("run_id", "qwen-qwen2-5-0-5b-instruct"),
        "label": active_manifest.get("label", "local-qwen-0.5b-active-smoke"),
        "model_id": active_manifest.get("model_id", "Qwen/Qwen2.5-0.5B-Instruct"),
        "base_model": active_manifest.get("base_model", "Qwen/Qwen2.5-0.5B-Instruct"),
        "preferred_artifact": active_manifest.get("preferred_artifact", "grpo_adapter"),
        "availability": active_manifest.get("availability", {}),
        "remote_full_sweep_note": (
            "The full Qwen 0.5B/1.5B remote sweep artifacts are still pending upload in the HF artifact repo. "
            "This bundle packages the currently active local trained/smoke artifacts for product integration."
        ),
        "copies": copies,
    }
    write_json(bundle_dir / "bundle_manifest.json", manifest)
    write_readme(bundle_dir, manifest, artifact_repo_id)
    return manifest


def upload_bundle(bundle_dir: Path, repo_id: str, bundle_name: str) -> str:
    """Upload *bundle_dir* to the Hugging Face model repo *repo_id*.

    The repo is created as private if it does not exist yet. The token is read
    from the ``HF_TOKEN`` environment variable and passed to ``HfApi``.

    Returns:
        The path inside the repo (``usable_model_bundles/<bundle_name>``).

    Raises:
        RuntimeError: If ``huggingface_hub`` is not installed.
    """
    if HfApi is None:
        raise RuntimeError("huggingface_hub is required for --upload; install it with `pip install huggingface_hub`.")
    token = os.getenv("HF_TOKEN")
    api = HfApi(token=token)
    api.create_repo(repo_id=repo_id, repo_type="model", private=True, exist_ok=True)
    path_in_repo = f"usable_model_bundles/{bundle_name}"
    api.upload_folder(
        repo_id=repo_id,
        repo_type="model",
        folder_path=str(bundle_dir),
        path_in_repo=path_in_repo,
        commit_message=f"Upload PolyGuard usable model bundle: {bundle_name}",
        ignore_patterns=[".DS_Store", "**/.DS_Store"],
    )
    return path_in_repo


def main(argv: Sequence[str] | None = None) -> None:
    """Package the bundle, optionally zip and upload it, then print the manifest."""
    args = parse_args(argv)
    output_root = Path(args.output_root)
    manifest = package_bundle(
        bundle_name=args.bundle_name,
        output_root=output_root,
        include_merged=not args.no_merged,
        artifact_repo_id=args.artifact_repo_id,
    )
    bundle_dir = output_root / args.bundle_name

    zip_path = zip_bundle(bundle_dir) if args.zip else None
    if zip_path:
        manifest["zip_path"] = str(zip_path)

    if args.upload:
        manifest["hf_repo_id"] = args.artifact_repo_id
        manifest["hf_path_in_repo"] = upload_bundle(bundle_dir, args.artifact_repo_id, args.bundle_name)
        manifest["hf_url"] = f"https://huggingface.co/{args.artifact_repo_id}/tree/main/{manifest['hf_path_in_repo']}"

    # Persist the final manifest so the on-disk copy includes zip/upload metadata.
    write_json(bundle_dir / "bundle_manifest.json", manifest)
    print(json.dumps(manifest, ensure_ascii=True, indent=2))


if __name__ == "__main__":
    main()