Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """Optional allow-listed crawler (disabled by default).""" | |
| from __future__ import annotations | |
| import json | |
| import os | |
| from pathlib import Path | |
| from app.dataops.source_manager import DataAcquisitionAgent | |
def main() -> None:
    """Fetch allow-listed reference pages and cache them as JSON.

    This is a no-op unless the POLYGUARD_ALLOW_WEB_FETCH environment
    variable is set to "true" (case-insensitive); web access is opt-in
    by design. Results are written to
    data/raw/literature_cache/label_guideline_records.json under the
    project root.
    """
    # Opt-in gate: anything other than "true" disables fetching.
    flag = os.getenv("POLYGUARD_ALLOW_WEB_FETCH", "false")
    if flag.lower() != "true":
        print("web_fetch_disabled")
        return

    # Project root is one directory above this script's location.
    project_root = Path(__file__).resolve().parents[1]
    cache_dir = project_root / "data" / "raw" / "literature_cache"
    cache_dir.mkdir(parents=True, exist_ok=True)

    fetcher = DataAcquisitionAgent(
        root=project_root,
        allow_domains=["who.int", "nih.gov", "fda.gov"],
    )
    sources = ("https://www.who.int", "https://www.nih.gov")
    records = [fetcher.acquire_web_knowledge(url) for url in sources]

    payload = json.dumps(records, ensure_ascii=True, indent=2)
    (cache_dir / "label_guideline_records.json").write_text(payload, encoding="utf-8")
    print("web_fetch_complete")
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()