File size: 1,006 Bytes
877add7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/usr/bin/env python3
"""Offline-first source ingestion entrypoint."""

from __future__ import annotations

import json
import os
from pathlib import Path

from app.dataops.source_manager import DataAcquisitionAgent


def main() -> None:
    """Run the ingestion pass and persist its output as JSON.

    Builds a ``DataAcquisitionAgent`` rooted at the directory one level above
    this script's own folder, stores whatever ``acquire_local_knowledge()``
    returns under ``"local_records"``, and, when the environment variable
    ``POLYGUARD_OPTIONAL_LABEL_URL`` holds a non-empty value, also stores the
    result of ``acquire_web_knowledge(url, offline_first=True)`` under
    ``"web_record"``. The payload is written to
    ``<root>/data/processed/ingested_sources.json`` and the number of local
    records is printed to stdout.
    """
    # parents[0] is the script's folder, parents[1] the directory above it.
    project_root = Path(__file__).resolve().parents[1]
    agent = DataAcquisitionAgent(
        root=project_root,
        allow_domains=["who.int", "nih.gov", "fda.gov", "ema.europa.eu"],
    )
    local_records = agent.acquire_local_knowledge()

    processed_dir = project_root / "data" / "processed"
    processed_dir.mkdir(parents=True, exist_ok=True)

    payload: dict[str, object] = {"local_records": local_records}
    label_url = os.environ.get("POLYGUARD_OPTIONAL_LABEL_URL")
    if label_url:
        # Unset or empty variable means the web lookup is skipped entirely.
        payload["web_record"] = agent.acquire_web_knowledge(label_url, offline_first=True)

    serialized = json.dumps(payload, ensure_ascii=True, indent=2)
    target_file = processed_dir / "ingested_sources.json"
    target_file.write_text(serialized, encoding="utf-8")
    print(f"ingested_records={len(local_records)}")


# Run the ingestion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()