| |
| """Offline-first source ingestion entrypoint.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| from pathlib import Path |
|
|
| from app.dataops.source_manager import DataAcquisitionAgent |
|
|
|
|
def main() -> None:
    """Run one ingestion pass and persist the result as JSON.

    Steps, in order:

    1. Build a ``DataAcquisitionAgent`` rooted at ``parents[1]`` of this
       file's resolved path, restricted to a fixed list of allowed domains.
    2. Ask the agent for its local knowledge records.
    3. Ensure ``<root>/data/processed`` exists.
    4. If the ``POLYGUARD_OPTIONAL_LABEL_URL`` environment variable is set to
       a non-empty value, additionally request that URL from the agent with
       ``offline_first=True`` and store the result under ``"web_record"``.
    5. Write the payload to ``ingested_sources.json`` (UTF-8, ASCII-escaped,
       2-space indent) and print the number of local records.

    NOTE(review): the record objects returned by the agent are assumed to be
    JSON-serialisable and the local result to support ``len()`` -- confirm
    against ``DataAcquisitionAgent``.
    """
    project_root = Path(__file__).resolve().parents[1]
    agent = DataAcquisitionAgent(
        root=project_root,
        allow_domains=["who.int", "nih.gov", "fda.gov", "ema.europa.eu"],
    )

    # Local acquisition runs before any output directory is created, so a
    # failure here leaves the filesystem untouched by this script.
    records = agent.acquire_local_knowledge()

    target_dir = project_root / "data" / "processed"
    target_dir.mkdir(parents=True, exist_ok=True)

    payload: dict[str, object] = {"local_records": records}

    # The web source is strictly opt-in via the environment; an unset or
    # empty variable skips it entirely.
    label_url = os.environ.get("POLYGUARD_OPTIONAL_LABEL_URL")
    if label_url:
        payload["web_record"] = agent.acquire_web_knowledge(
            label_url,
            offline_first=True,
        )

    serialized = json.dumps(payload, ensure_ascii=True, indent=2)
    destination = target_dir / "ingested_sources.json"
    destination.write_text(serialized, encoding="utf-8")

    print(f"ingested_records={len(records)}")
|
|
|
|
# Run the ingestion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|