Spaces:

adesh01
/

civicsetu

Running

adeshboudh16

feat: Phase 2 multi-jurisdiction ingestion complete

7ea4089 about 2 months ago

2.12 kB

	# scripts/ingest_phase2.py
	"""
	Phase 2 ingestion: Maharashtra Real Estate Rules 2017.

	Run:
	uv run python scripts/ingest_phase2.py

	What it does:
	1. Downloads MahaRERA Rules 2017 PDF (cached after first run)
	2. Chunks using Rule-boundary regex
	3. Embeds + persists to Postgres/pgvector
	4. Re-seeds Neo4j (adds MahaRERA Section nodes + REFERENCES + DERIVED_FROM edges)
	"""
	from __future__ import annotations

	import asyncio
	import sys
	from pathlib import Path

	# Put the repository's src/ directory (sibling of this script's folder) at the
	# front of the import path so the civicsetu package resolves when run directly.
	sys.path.insert(0, str(Path(__file__).parent.parent.joinpath("src")))

	import structlog

	from civicsetu.config.document_registry import DOCUMENT_REGISTRY
	from civicsetu.ingestion.pipeline import IngestionPipeline
	from civicsetu.ingestion.graph_seeder import GraphSeeder
	from civicsetu.stores.graph_store import _get_driver

	# Module-level structlog logger, named after this module; used by main() to
	# emit the phase2_* progress events.
	log = structlog.get_logger(__name__)


	def main() -> None:
	    """Ingest the MahaRERA Rules 2017 document, seed the graph, print a summary.

	    Reads the ``mahrera_rules_2017`` entry from ``DOCUMENT_REGISTRY``, passes
	    its fields to ``IngestionPipeline.ingest_document``, then runs
	    ``GraphSeeder.seed_from_postgres`` for the resulting document id and
	    writes a short report to stdout.

	    Raises:
	        KeyError: if the registry has no ``mahrera_rules_2017`` entry.
	    """
	    entry = DOCUMENT_REGISTRY["mahrera_rules_2017"]
	    log.info("phase2_ingest_start", doc=entry.name, url=entry.url)

	    # Stage 1: run the ingestion pipeline (Postgres + pgvector) for this entry.
	    ingestor = IngestionPipeline()
	    ingest_kwargs = {
	        "source_url": entry.url,
	        "doc_name": entry.name,
	        "jurisdiction": entry.jurisdiction,
	        "doc_type": entry.doc_type,
	        "effective_date": entry.effective_date,
	        "dest_subdir": entry.dest_subdir,
	        "filename": entry.filename,
	    }
	    ingested = ingestor.ingest_document(**ingest_kwargs)

	    # The id is logged and passed on as a string, so convert it once.
	    doc_id_text = str(ingested.doc_id)
	    log.info(
	        "phase2_ingest_complete",
	        doc_id=doc_id_text,
	        chunks=ingested.total_chunks,
	    )

	    # Stage 2: re-seed Neo4j, restricted to the document just ingested.
	    log.info("phase2_graph_seed_start", doc_id=doc_id_text)
	    seed_coro = GraphSeeder.seed_from_postgres(doc_id=doc_id_text)
	    graph_stats = asyncio.run(seed_coro)
	    log.info("phase2_graph_seed_complete", stats=graph_stats)

	    # Stage 3: human-readable summary framed by two 60-character rules.
	    rule = "=" * 60
	    print(f"\n{rule}")
	    print("Phase 2 ingestion complete")
	    print(f" Document : {entry.name}")
	    print(f" Doc ID : {ingested.doc_id}")
	    print(f" Chunks : {ingested.total_chunks}")
	    print(f" Sections : {graph_stats.get('sections')}")
	    print(f" REFERENCES: {graph_stats.get('refs')}")
	    print(rule)



	# Run the ingestion only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()