File size: 972 Bytes
877add7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/env python3
"""Optional allow-listed crawler (disabled by default)."""

from __future__ import annotations

import json
import os
from pathlib import Path

from app.dataops.source_manager import DataAcquisitionAgent


def main() -> None:
    """Fetch a fixed set of pages and cache the resulting records as JSON.

    The crawl is opt-in: unless the ``POLYGUARD_ALLOW_WEB_FETCH`` environment
    variable equals ``"true"`` (case-insensitive), the function prints
    ``web_fetch_disabled`` and returns without touching the filesystem or
    constructing the agent.

    When enabled, records are written to
    ``data/raw/literature_cache/label_guideline_records.json`` under the
    parent of this script's directory, and ``web_fetch_complete`` is printed.
    """
    flag = os.environ.get("POLYGUARD_ALLOW_WEB_FETCH", "false")
    if flag.lower() != "true":
        print("web_fetch_disabled")
        return

    # parents[1] of the resolved script path: one level above the script's folder.
    project_root = Path(__file__).resolve().parents[1]
    cache_dir = project_root.joinpath("data", "raw", "literature_cache")
    cache_dir.mkdir(parents=True, exist_ok=True)

    fetcher = DataAcquisitionAgent(
        root=project_root,
        allow_domains=["who.int", "nih.gov", "fda.gov"],
    )
    # Fetched in this order; each result is stored as returned by the agent.
    seed_urls = ("https://www.who.int", "https://www.nih.gov")
    fetched = [fetcher.acquire_web_knowledge(url) for url in seed_urls]

    # Serialize fully before writing so the file is only touched on success.
    payload = json.dumps(fetched, ensure_ascii=True, indent=2)
    target = cache_dir / "label_guideline_records.json"
    target.write_text(payload, encoding="utf-8")
    print("web_fetch_complete")


# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()