polyguard-openenv / scripts /ingest_open_drug_sources.py
TheJackBright's picture
Deploy PolyGuard OpenEnv Space
877add7 verified
#!/usr/bin/env python3
"""Offline-first source ingestion entrypoint."""
from __future__ import annotations
import json
import os
from pathlib import Path
from app.dataops.source_manager import DataAcquisitionAgent
def main() -> None:
    """Gather source records and write them to ``data/processed/ingested_sources.json``.

    The project root is taken to be the parent of the directory holding this
    script. Local records are always collected; a single web record is added
    under the ``"web_record"`` key only when the environment variable
    ``POLYGUARD_OPTIONAL_LABEL_URL`` is set to a non-empty value. The number
    of local records is printed to stdout once the file is written.
    """
    project_root = Path(__file__).resolve().parents[1]

    # Domain allow-list handed to the agent; how it is enforced is up to
    # DataAcquisitionAgent (not visible from this script).
    agent = DataAcquisitionAgent(
        root=project_root,
        allow_domains=["who.int", "nih.gov", "fda.gov", "ema.europa.eu"],
    )
    local_records = agent.acquire_local_knowledge()

    # Create the output directory up front, before any optional web lookup.
    processed_dir = project_root / "data" / "processed"
    processed_dir.mkdir(parents=True, exist_ok=True)

    result: dict[str, object] = {"local_records": local_records}

    # An unset or empty variable means "local records only".
    label_url = os.environ.get("POLYGUARD_OPTIONAL_LABEL_URL")
    if label_url:
        result["web_record"] = agent.acquire_web_knowledge(
            label_url,
            offline_first=True,
        )

    serialized = json.dumps(result, ensure_ascii=True, indent=2)
    target = processed_dir / "ingested_sources.json"
    target.write_text(serialized, encoding="utf-8")

    print(f"ingested_records={len(local_records)}")
# Run the ingestion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()