| |
| """ |
| Phase 5 ingestion — Karnataka RERA: |
| 1. Karnataka RERA Rules 2017 |
| |
| Run: |
| uv run python scripts/ingest_phase5.py |
| """ |
| from __future__ import annotations |
|
|
| import asyncio |
| import sys |
| from pathlib import Path |
|
|
| sys.path.insert(0, str(Path(__file__).parent.parent / "src")) |
|
|
| import structlog |
| from civicsetu.config.document_registry import DOCUMENT_REGISTRY |
| from civicsetu.ingestion.pipeline import IngestionPipeline |
| from civicsetu.ingestion.graph_seeder import GraphSeeder |
|
|
| log = structlog.get_logger(__name__) |
|
|
| |
| |
# Karnataka-specific lookup table; keys and values are numeric strings.
# NOTE(review): going by the name, this presumably maps a Karnataka rule
# number to the number of the provision it is derived from -- confirm against
# the consumer. Nothing in the visible part of this file reads it.
# Entries "7" and "8" intentionally(?) share the target "9" -- verify.
_KA_DERIVED_FROM_MAP = dict(
    (
        ("3", "4"),
        ("4", "5"),
        ("5", "6"),
        ("6", "7"),
        ("7", "9"),
        ("8", "9"),
        ("9", "10"),
        ("10", "11"),
        ("11", "13"),
        ("12", "17"),
        ("13", "18"),
        ("14", "19"),
        ("15", "25"),
        ("16", "31"),
        ("17", "43"),
        ("18", "58"),
        ("19", "66"),
    )
)
|
|
|
|
def main() -> None:
    """Ingest the Karnataka RERA Rules 2017 and seed the graph from them.

    Steps:
      1. Look up the ``karnataka_rera_rules_2017`` entry in
         ``DOCUMENT_REGISTRY`` and pass its fields to
         ``IngestionPipeline.ingest_document``.
      2. If ingestion produced zero chunks, print a fallback hint and return
         without touching the graph.
      3. Otherwise run ``GraphSeeder.seed_from_postgres`` for the new
         document id (driven to completion with ``asyncio.run``) and print a
         summary of the chunk count and the seeder's returned stats.

    Progress is logged through the module-level ``log``; the human-readable
    summary goes to stdout via ``print``.
    """
    pipeline = IngestionPipeline()
    spec = DOCUMENT_REGISTRY["karnataka_rera_rules_2017"]

    log.info("phase5_ingest_start", doc=spec.name)
    doc = pipeline.ingest_document(
        source_url=spec.url,
        doc_name=spec.name,
        jurisdiction=spec.jurisdiction,
        doc_type=spec.doc_type,
        effective_date=spec.effective_date,
        dest_subdir=spec.dest_subdir,
        filename=spec.filename,
        max_pages=spec.max_pages,
    )
    log.info(
        "phase5_ingest_complete",
        doc=spec.name,
        chunks=doc.total_chunks,
        doc_id=str(doc.doc_id),
    )

    if doc.total_chunks == 0:
        # Nothing was extracted, so there is nothing to seed; point the
        # operator at the alternative source and stop here.
        print("\n✗ ABORT: 0 chunks — PDF is likely fully scanned.")
        print(" Fallback: use NAREDCO mirror and re-run.")
        print(" URL: https://naredco.in/notification/pdfs/Karnataka%20Real%20Estate%20(Regulation%20and%20Development)%20Rules,%202017.pdf")
        return

    # seed_from_postgres is awaited through asyncio.run, so this script owns
    # the event loop for the duration of the seeding step.
    stats = asyncio.run(GraphSeeder.seed_from_postgres(doc_id=str(doc.doc_id)))

    print("\n" + "=" * 60)
    print("Phase 5 — Karnataka RERA ingestion complete")
    print(f" Chunks : {doc.total_chunks}")
    print(f" Graph stats : {stats}")
    print("=" * 60)

    # A non-zero but tiny chunk count is treated as suspicious rather than
    # fatal: the run is reported as complete, with a warning attached.
    if doc.total_chunks < 10:
        print("\n⚠ WARNING: Very few chunks — check scanned_pages in log above")
    else:
        print("\n✓ Karnataka wired.")


# Run the ingestion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|