#!/usr/bin/env python3 """Optional allow-listed crawler (disabled by default).""" from __future__ import annotations import json import os from pathlib import Path from app.dataops.source_manager import DataAcquisitionAgent def main() -> None: enabled = os.getenv("POLYGUARD_ALLOW_WEB_FETCH", "false").lower() == "true" if not enabled: print("web_fetch_disabled") return root = Path(__file__).resolve().parents[1] out = root / "data" / "raw" / "literature_cache" out.mkdir(parents=True, exist_ok=True) agent = DataAcquisitionAgent(root=root, allow_domains=["who.int", "nih.gov", "fda.gov"]) records = [ agent.acquire_web_knowledge("https://www.who.int"), agent.acquire_web_knowledge("https://www.nih.gov"), ] (out / "label_guideline_records.json").write_text(json.dumps(records, ensure_ascii=True, indent=2), encoding="utf-8") print("web_fetch_complete") if __name__ == "__main__": main()