# civicsetu/scripts/ingest_phase0.py
# Last change by adeshboudh16 (commit 3461098):
#   feat(phase3): cross-jurisdiction graph edges + DERIVED_FROM seeding
"""
Phase 0 ingestion: RERA Act 2016 (Central).
Idempotent — safe to re-run, upserts on conflict.
Run:
uv run python scripts/ingest_phase0.py
"""
from __future__ import annotations
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
import structlog
from civicsetu.config.document_registry import DOCUMENT_REGISTRY
from civicsetu.ingestion.pipeline import IngestionPipeline
from civicsetu.ingestion.graph_seeder import GraphSeeder
log = structlog.get_logger(__name__)
def main() -> None:
    """Ingest the ``rera_act_2016`` registry entry, then seed its graph.

    Steps, in order:

    1. Look up the ``"rera_act_2016"`` spec in ``DOCUMENT_REGISTRY``.
    2. Hand the spec's fields to ``IngestionPipeline.ingest_document``.
    3. Log the resulting document id and chunk count.
    4. Run ``GraphSeeder.seed_from_postgres`` for that document id on a
       fresh event loop and log whatever stats it returns.

    Raises:
        KeyError: if ``"rera_act_2016"`` is missing from the registry.
        Any exception raised by the pipeline or the seeder propagates
        unchanged; nothing is caught here.
    """
    rera_spec = DOCUMENT_REGISTRY["rera_act_2016"]
    ingestor = IngestionPipeline()

    # Collect the keyword arguments up front so the call itself stays short.
    ingest_kwargs = {
        "source_url": rera_spec.url,
        "doc_name": rera_spec.name,
        "jurisdiction": rera_spec.jurisdiction,
        "doc_type": rera_spec.doc_type,
        "effective_date": rera_spec.effective_date,
        "dest_subdir": rera_spec.dest_subdir,
        "filename": rera_spec.filename,
    }
    ingested = ingestor.ingest_document(**ingest_kwargs)

    # Both the log line and the seeder take the id as a string.
    ingested_id = str(ingested.doc_id)
    log.info(
        "phase0_ingest_complete",
        doc_id=ingested_id,
        chunks=ingested.total_chunks,
    )

    # seed_from_postgres is awaited via asyncio.run, so this script owns the loop.
    seed_stats = asyncio.run(GraphSeeder.seed_from_postgres(doc_id=ingested_id))
    log.info("phase0_graph_seed_complete", stats=seed_stats)
# Script entry point: run the ingestion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()