civicsetu / scripts /ingest_phase5.py
adeshboudh16
feat: Phase 5 complete — Karnataka RERA Rules 2017
fce9fad
# scripts/ingest_phase5.py
"""
Phase 5 ingestion — Karnataka RERA:
1. Karnataka RERA Rules 2017
Run:
uv run python scripts/ingest_phase5.py
"""
from __future__ import annotations
import asyncio
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
import structlog
from civicsetu.config.document_registry import DOCUMENT_REGISTRY
from civicsetu.ingestion.pipeline import IngestionPipeline
from civicsetu.ingestion.graph_seeder import GraphSeeder
log = structlog.get_logger(__name__)
# Karnataka Rule → RERA Act Section (hand-mapped).
# Karnataka Rules 2017 implements RERA Act 2016 under Section 84.
# Keys are Karnataka rule numbers and values are RERA Act section numbers;
# both are stored as strings.
# NOTE(review): this mapping is not referenced anywhere else in the visible
# part of this script — confirm which module consumes it.
_KA_DERIVED_FROM_MAP = {
    "3": "4",  # Application for registration of project → promoter obligations
    "4": "5",  # Grant / rejection of registration
    "5": "6",  # Extension of registration
    "6": "7",  # Revocation of registration
    "7": "9",  # Registration of real estate agents
    "8": "9",  # Grant / rejection of agent registration
    "9": "10",  # Functions of real estate agents
    "10": "11",  # Agreement for Sale
    "11": "13",  # Responsibility of promoter post-possession
    "12": "17",  # Conveyance deed
    "13": "18",  # Timelines for refund
    "14": "19",  # Rate of interest
    "15": "25",  # Functions of regulatory authority
    "16": "31",  # Complaints to authority
    "17": "43",  # Appellate Tribunal composition
    "18": "58",  # Offences by companies
    "19": "66",  # Compounding of offences
}
def main() -> None:
    """Ingest the Karnataka RERA Rules 2017 document and seed the graph.

    Looks up the ``karnataka_rera_rules_2017`` entry in ``DOCUMENT_REGISTRY``,
    passes its fields to ``IngestionPipeline.ingest_document``, and, when the
    ingestion produced at least one chunk, runs
    ``GraphSeeder.seed_from_postgres`` for the new document id. A summary is
    printed to stdout.

    Returns:
        None. When ingestion yields zero chunks the function prints an abort
        message with a fallback URL and returns without seeding the graph.
    """
    pipeline = IngestionPipeline()
    spec = DOCUMENT_REGISTRY["karnataka_rera_rules_2017"]

    log.info("phase5_ingest_start", doc=spec.name)
    doc = pipeline.ingest_document(
        source_url=spec.url,
        doc_name=spec.name,
        jurisdiction=spec.jurisdiction,
        doc_type=spec.doc_type,
        effective_date=spec.effective_date,
        dest_subdir=spec.dest_subdir,
        filename=spec.filename,
        max_pages=spec.max_pages,
    )
    log.info(
        "phase5_ingest_complete",
        doc=spec.name,
        chunks=doc.total_chunks,
        doc_id=str(doc.doc_id),
    )

    # Nothing was extracted, so there is nothing to seed; point the operator
    # at the alternative source and stop before touching the graph.
    if doc.total_chunks == 0:
        print("\n✗ ABORT: 0 chunks — PDF is likely fully scanned.")
        print(" Fallback: use NAREDCO mirror and re-run.")
        print(" URL: https://naredco.in/notification/pdfs/Karnataka%20Real%20Estate%20(Regulation%20and%20Development)%20Rules,%202017.pdf")
        return

    # seed_from_postgres is awaited through asyncio.run, so this script drives
    # its own event loop for the duration of the seeding step.
    stats = asyncio.run(GraphSeeder.seed_from_postgres(doc_id=str(doc.doc_id)))

    print("\n" + "=" * 60)
    print("Phase 5 — Karnataka RERA ingestion complete")
    print(f" Chunks : {doc.total_chunks}")
    print(f" Graph stats : {stats}")
    print("=" * 60)

    # A non-zero but very small chunk count is treated as suspicious rather
    # than fatal: the graph has already been seeded, so only warn.
    if doc.total_chunks < 10:
        print("\n⚠ WARNING: Very few chunks — check scanned_pages in log above")
    else:
        print("\n✓ Karnataka wired.")
# Run the ingestion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()