| |
| """Optional allow-listed crawler (disabled by default).""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| from pathlib import Path |
|
|
| from app.dataops.source_manager import DataAcquisitionAgent |
|
|
|
|
def main() -> None:
    """Fetch allow-listed guideline pages and cache the results as JSON.

    The crawl is opt-in. It runs only when the ``POLYGUARD_ALLOW_WEB_FETCH``
    environment variable equals ``"true"`` (compared case-insensitively).
    For any other value, or when the variable is unset, the function prints
    ``web_fetch_disabled`` and returns before creating any directory or
    agent.

    When enabled, a ``DataAcquisitionAgent`` constructed with the allow-list
    ``who.int`` / ``nih.gov`` / ``fda.gov`` is asked for the WHO and NIH
    landing pages, in that order. The collected records are written to
    ``<root>/data/raw/literature_cache/label_guideline_records.json``, where
    ``<root>`` is the parent of the directory containing this file, and
    ``web_fetch_complete`` is printed.

    NOTE(review): the records are passed straight to ``json.dumps``, so this
    assumes ``acquire_web_knowledge`` returns JSON-serializable values --
    confirm against ``DataAcquisitionAgent``.
    """
    # Safety gate: network access must be requested explicitly.
    flag = os.getenv("POLYGUARD_ALLOW_WEB_FETCH", "false")
    if flag.lower() != "true":
        print("web_fetch_disabled")
        return

    # parents[0] is this file's directory; parents[1] is one level above it.
    root = Path(__file__).resolve().parents[1]
    out = root / "data" / "raw" / "literature_cache"
    out.mkdir(parents=True, exist_ok=True)

    agent = DataAcquisitionAgent(
        root=root,
        allow_domains=["who.int", "nih.gov", "fda.gov"],
    )

    # Fetch order is preserved so the output file keeps the same record order.
    urls = ("https://www.who.int", "https://www.nih.gov")
    records = [agent.acquire_web_knowledge(url) for url in urls]

    payload = json.dumps(records, ensure_ascii=True, indent=2)
    (out / "label_guideline_records.json").write_text(payload, encoding="utf-8")
    print("web_fetch_complete")
|
|
|
|
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|