civicsetu / scripts /ingest_phase2.py
adeshboudh16
feat: Phase 2 multi-jurisdiction ingestion complete
7ea4089
# scripts/ingest_phase2.py
"""
Phase 2 ingestion: Maharashtra Real Estate Rules 2017.
Run:
uv run python scripts/ingest_phase2.py
What it does:
1. Downloads MahaRERA Rules 2017 PDF (cached after first run)
2. Chunks using Rule-boundary regex
3. Embeds + persists to Postgres/pgvector
4. Re-seeds Neo4j (adds MahaRERA Section nodes + REFERENCES + DERIVED_FROM edges)
"""
from __future__ import annotations
import asyncio
import sys
from pathlib import Path
# Ensure src/ is on PYTHONPATH when run directly
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
import structlog
from civicsetu.config.document_registry import DOCUMENT_REGISTRY
from civicsetu.ingestion.pipeline import IngestionPipeline
from civicsetu.ingestion.graph_seeder import GraphSeeder
from civicsetu.stores.graph_store import _get_driver
log = structlog.get_logger(__name__)
def main() -> None:
    """Ingest the registered MahaRERA Rules 2017 document, then seed the graph.

    Looks up the ``"mahrera_rules_2017"`` entry in ``DOCUMENT_REGISTRY``,
    forwards its fields to ``IngestionPipeline.ingest_document``, drives
    ``GraphSeeder.seed_from_postgres`` to completion with ``asyncio.run`` for
    the resulting document id, and prints a short summary to stdout.

    NOTE(review): a missing registry key presumably raises ``KeyError``
    (assumes ``DOCUMENT_REGISTRY`` is a plain mapping) — confirm.
    """
    registry_entry = DOCUMENT_REGISTRY["mahrera_rules_2017"]
    log.info("phase2_ingest_start", doc=registry_entry.name, url=registry_entry.url)

    # Step 1 — Ingest into Postgres + pgvector
    ingested = IngestionPipeline().ingest_document(
        source_url=registry_entry.url,
        doc_name=registry_entry.name,
        jurisdiction=registry_entry.jurisdiction,
        doc_type=registry_entry.doc_type,
        effective_date=registry_entry.effective_date,
        dest_subdir=registry_entry.dest_subdir,
        filename=registry_entry.filename,
    )
    # The document id is logged and handed to the seeder in string form.
    doc_id_text = str(ingested.doc_id)
    log.info(
        "phase2_ingest_complete",
        doc_id=doc_id_text,
        chunks=ingested.total_chunks,
    )

    # Step 2 — Re-seed Neo4j for this document only
    log.info("phase2_graph_seed_start", doc_id=doc_id_text)
    seeding = GraphSeeder.seed_from_postgres(doc_id=doc_id_text)
    seed_stats = asyncio.run(seeding)
    log.info("phase2_graph_seed_complete", stats=seed_stats)

    # Step 3 — Print summary
    rule = "=" * 60
    summary_lines = (
        "\n" + rule,
        "Phase 2 ingestion complete",
        f" Document : {registry_entry.name}",
        f" Doc ID : {ingested.doc_id}",
        f" Chunks : {ingested.total_chunks}",
        # .get() is used for the counters, so an absent key prints as None.
        f" Sections : {seed_stats.get('sections')}",
        f" REFERENCES: {seed_stats.get('refs')}",
        rule,
    )
    for line in summary_lines:
        print(line)
# Run the ingestion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()