File size: 2,124 Bytes
7ea4089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# scripts/ingest_phase2.py
"""
Phase 2 ingestion: Maharashtra Real Estate Rules 2017.

Run:
    uv run python scripts/ingest_phase2.py

What it does:
    1. Downloads MahaRERA Rules 2017 PDF (cached after first run)
    2. Chunks using Rule-boundary regex
    3. Embeds + persists to Postgres/pgvector
    4. Re-seeds Neo4j (adds MahaRERA Section nodes + REFERENCES + DERIVED_FROM edges)
"""
from __future__ import annotations

import asyncio
import sys
from pathlib import Path

# Ensure src/ is on PYTHONPATH when run directly
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

import structlog

from civicsetu.config.document_registry import DOCUMENT_REGISTRY
from civicsetu.ingestion.pipeline import IngestionPipeline
from civicsetu.ingestion.graph_seeder import GraphSeeder
from civicsetu.stores.graph_store import _get_driver

log = structlog.get_logger(__name__)


def main() -> None:
    spec = DOCUMENT_REGISTRY["mahrera_rules_2017"]

    log.info("phase2_ingest_start", doc=spec.name, url=spec.url)

    # Step 1 — Ingest into Postgres + pgvector
    pipeline = IngestionPipeline()
    doc = pipeline.ingest_document(
        source_url=spec.url,
        doc_name=spec.name,
        jurisdiction=spec.jurisdiction,
        doc_type=spec.doc_type,
        effective_date=spec.effective_date,
        dest_subdir=spec.dest_subdir,
        filename=spec.filename,
    )

    log.info("phase2_ingest_complete", doc_id=str(doc.doc_id), chunks=doc.total_chunks)

    # Step 2 — Re-seed Neo4j for this document only
    log.info("phase2_graph_seed_start", doc_id=str(doc.doc_id))
    stats = asyncio.run(GraphSeeder.seed_from_postgres(doc_id=str(doc.doc_id)))
    log.info("phase2_graph_seed_complete", stats=stats)

    # Step 3 — Print summary
    print("\n" + "="*60)
    print("Phase 2 ingestion complete")
    print(f"   Document  : {spec.name}")
    print(f"   Doc ID    : {doc.doc_id}")
    print(f"   Chunks    : {doc.total_chunks}")
    print(f"   Sections  : {stats.get('sections')}")
    print(f"   REFERENCES: {stats.get('refs')}")
    print("="*60)



if __name__ == "__main__":
    main()