#!/usr/bin/env python3
"""Optional allow-listed crawler (disabled by default)."""
from __future__ import annotations
import json
import os
from pathlib import Path
from app.dataops.source_manager import DataAcquisitionAgent
def main() -> None:
    """Run the allow-listed crawler if it has been explicitly enabled.

    The fetch is opt-in via the POLYGUARD_ALLOW_WEB_FETCH environment
    variable; any value other than "true" (case-insensitive) keeps the
    crawler off and the function prints a sentinel and returns. When
    enabled, fetched records are cached as JSON under
    data/raw/literature_cache relative to the project root.
    """
    if os.getenv("POLYGUARD_ALLOW_WEB_FETCH", "false").lower() != "true":
        # Disabled by default: never touch the network unless opted in.
        print("web_fetch_disabled")
        return

    project_root = Path(__file__).resolve().parents[1]
    cache_dir = project_root / "data" / "raw" / "literature_cache"
    cache_dir.mkdir(parents=True, exist_ok=True)

    crawler = DataAcquisitionAgent(
        root=project_root,
        allow_domains=["who.int", "nih.gov", "fda.gov"],
    )
    fetched = [
        crawler.acquire_web_knowledge(url)
        for url in ("https://www.who.int", "https://www.nih.gov")
    ]

    target = cache_dir / "label_guideline_records.json"
    target.write_text(
        json.dumps(fetched, ensure_ascii=True, indent=2),
        encoding="utf-8",
    )
    print("web_fetch_complete")


if __name__ == "__main__":
    main()
|