"""Question-aware framing for the Capstone briefing opening.

The four-section structure (Status / Empirical / Modeled / Policy) is
load-bearing for the Mellea grounding checks and stays unchanged. What
this module does is detect the *shape* of the user's question from the
raw query string + planner intent, then return a short directive (one
to a few sentences) that conditions only the opening Status sentence.

Eleven question types are recognised; they mirror the rubric in
`tests/integration/stakeholder_queries.py:FRAMING_RUBRICS`. Detection
is deterministic regex matching — no extra LLM call, no added latency.

Usage:

    from app.framing import augment_system_prompt
    system_prompt = augment_system_prompt(
        EXTRA_SYSTEM_PROMPT, query=user_query, intent=plan.intent,
    )

The returned prompt has the original text plus a trailing
`QUESTION-AWARE OPENING:` block. Granite 4.1 attends to this through
the system-prompt cache and applies it to the Status sentence.
"""
from __future__ import annotations

import re
from typing import Final

# Closed set of question-shape labels. `detect` returns one of these;
# "generic_exposure" is the no-match fallback and maps to an empty
# directive. The order of this tuple is NOT the detection precedence --
# precedence is set by the order of the entries in `_PATTERNS`.
QUESTION_TYPES: Final[tuple[str, ...]] = (
    "habitability_decision",
    "legal_disclosure",
    "capital_planning",
    "underwriting",
    "journalism",
    "development_siting",
    "grant_evidence",
    "retrospective",
    "emergency_response",
    "comparison",
    "generic_exposure",
)


# ---- Per-type opening directives ------------------------------------------
#
# Each directive is a short instruction (one to a few sentences) that
# supplements (does not replace) the Status section's existing
# instruction. Granite 4.1 has a strong prior toward "this address is
# exposed to ..." openings; the directive overrides that in a
# question-shaped way without disturbing the four grounding invariants.

# Maps each question type to the directive text appended to the system
# prompt. The strings are prompt text consumed by the model at runtime --
# edit them as behavior, not as documentation. An empty string means
# "no override" (see `generic_exposure`).
_DIRECTIVES: dict[str, str] = {
    # Resident/renter deciding whether to live here: lead with a Yes/No verdict.
    "habitability_decision": (
        "The Status sentence MUST start with a direct verdict word "
        "(\"Yes\" if the documents show meaningful flood evidence, \"No\" "
        "if they don't), then name the single strongest piece of "
        "evidence with its [doc_id]. The user is deciding whether to "
        "live here — answer the question, then cite."
    ),
    # Real-estate professional: lead with whether disclosure is warranted.
    "legal_disclosure": (
        "The Status sentence MUST state whether the documents contain "
        "facts a NY RPL §462(2) or §231-b disclosure would need to "
        "record. Begin with \"Disclosure is warranted\" or \"Disclosure "
        "is not triggered\" based on the evidence, then name the "
        "specific fact with its [doc_id]. The user is a real-estate "
        "professional checking the disclosure threshold."
    ),
    # Infrastructure investor: frame the place as a prioritization candidate.
    "capital_planning": (
        "The Status sentence MUST frame the place as a capital-planning "
        "candidate: name the dominant exposure with its [doc_id] and "
        "indicate whether the evidence supports prioritization "
        "(\"merits prioritization\", \"ranks high for hardening\") or "
        "not. The user allocates infrastructure investment."
    ),
    # Underwriter: stress independent sourcing and the audit chain.
    "underwriting": (
        "The Status sentence MUST emphasize that every figure in the "
        "briefing is independently sourced — open with the dominant "
        "exposure and the specific [doc_id], then add a half-clause "
        "noting that the audit chain follows below. The user is an "
        "underwriter who needs a defensible loss narrative."
    ),
    # Data journalist: neutral, quotable prose without editorial verbs.
    "journalism": (
        "The Status sentence MUST be reproducible reporting prose: "
        "name the place, name the dominant exposure with [doc_id], "
        "and avoid editorial verbs like \"shocking\" or \"alarming\". "
        "The user is a data journalist who will cite this prose verbatim."
    ),
    # Developer/architect: lead with the count of active construction filings.
    "development_siting": (
        "The Status sentence MUST start with the count of active "
        "construction filings cited from [dob_permits] (e.g. \"N "
        "active construction filings sit inside ...\") and indicate "
        "which flood layer they intersect. The user is a developer or "
        "architect doing a pre-design siting check."
    ),
    # Grant writer: formal "Vulnerability assessment:" opening.
    "grant_evidence": (
        "The Status sentence MUST open with \"Vulnerability "
        "assessment:\" and name the place + dominant exposure with "
        "[doc_id]. Treat the briefing as the evidence section of a "
        "HUD CDBG-DR or FEMA BRIC application — formal, third-person, "
        "free of advocacy framing."
    ),
    # Historical question: state that only present-day data is reflected.
    "retrospective": (
        "Riprap currently runs on present-day data sources. The Status "
        "sentence MUST acknowledge the question is retrospective and "
        "state explicitly that the briefing reflects the CURRENT state "
        "of these data sources, not a snapshot from the requested date. "
        "Then proceed with the present-day exposure picture so the user "
        "still gets the geography. Silence-over-confabulation: never "
        "reconstruct historical conditions you can't verify."
    ),
    # Live event: quantify near-term risk and cite the triggering signal.
    "emergency_response": (
        "The Status sentence MUST quantify what is at risk in the "
        "next few hours, citing the live signal that triggered the "
        "query and any active alerts with [doc_id]. The user needs an "
        "operational picture, not a historical exposure summary."
    ),
    # Head-to-head question: name both places and which is more exposed.
    "comparison": (
        "The Status sentence MUST name BOTH places the user is "
        "comparing and indicate which one shows greater exposure on "
        "the strongest cited signal. If only one place's data is "
        "available in the documents, say so explicitly. The user is "
        "doing a head-to-head decision."
    ),
    "generic_exposure": "",  # default — no override
}


# ---- Detector -------------------------------------------------------------
#
# Patterns are ordered: the FIRST type whose pattern matches wins. Order
# matters — more specific question shapes (legal_disclosure, grant_evidence,
# emergency_response) come before more general ones (habitability_decision,
# capital_planning) so the obvious specialist tags don't get swallowed.

# Ordered (question_type, patterns) pairs. `detect` walks this list top to
# bottom and returns the type of the first entry with ANY matching pattern,
# so list position is the precedence. All patterns are case-insensitive.
_PATTERNS: list[tuple[str, list[re.Pattern[str]]]] = [
    ("retrospective", [
        re.compile(r"\b(would have|would Riprap|on (the )?date of|as of (the )?(date|day)|"
                   r"day before|prior to|before (Hurricane|Ida|Sandy|the storm)|"
                   r"on (August|September|October|November|December|January|February|March|"
                   r"April|May|June|July) \d{1,2},? ?\d{4}|"
                   r"time.?machine|retrospective|court (exhibit|testimony))\b", re.I),
    ]),
    ("emergency_response", [
        re.compile(r"\b(just triggered|right now|next (few |six |\d+ )?hours?|"
                   r"in the next \d+|currently flooding|flood (warning|watch) is active|"
                   r"sensor [A-Z]{2}-?\d+|live (alert|trigger))\b", re.I),
    ]),
    ("legal_disclosure", [
        re.compile(r"\b(disclos(e|ure|ed)|RPL\s*§?\s*\d+|Property Condition Disclosure|"
                   r"seller'?s? disclosure|landlord'?s? disclosure|"
                   r"required to disclose|need to disclose)\b", re.I),
        # Bare section citations get their own pattern with NO leading `\b`.
        # `§` is not a word character, so `\b§` only matches when a word
        # character sits directly before the sign; behind a leading `\b`
        # these alternatives could never match " §462" or "§ 231-b".
        re.compile(r"§\s*(462|231-?b)\b", re.I),
    ]),
    ("grant_evidence", [
        re.compile(r"\b(vulnerability assessment|CDBG-?DR|HUD|BRIC|"
                   r"grant application|funding application|community resilience grant|"
                   r"FEMA application|disaster recovery (application|funding))\b", re.I),
    ]),
    ("development_siting", [
        # `buil(d|ding|t)` so that "what is being built" matches; the plural
        # forms are spelled out because the trailing `\b` would otherwise
        # reject "active permits", "DOB filings" and "developers".
        re.compile(r"\b(what (are|is) (they|being) buil(d|ding|t)|new construction|"
                   r"under construction|active (construction|filings?|projects?|permits?)|"
                   r"projects? (in progress|underway|planned)|architects?|"
                   r"siting check|pre.?design|"
                   r"DOB filings?|developers?)\b", re.I),
    ]),
    ("comparison", [
        re.compile(r"\b(compare\b|comparison|\bvs\b|\bversus\b|"
                   r"head-?to-?head|\brank\s+the\s+top)\b", re.I),
        # `prioritize X over Y` can have many words (or line breaks, hence
        # re.S) in between; the gap is capped at 80 chars to avoid runaway
        # scans. The quantifier is greedy, which does not change whether
        # `search` finds a match.
        re.compile(r"\bprioritize\b.{1,80}\bover\b", re.I | re.S),
        re.compile(r"\bover\s+\w+(?:\s+\w+){0,3}\s+for\s+(hardening|investment)\b", re.I),
    ]),
    ("capital_planning", [
        # The subject ("we "/"the ") is optional so that the infinitive form
        # "where to invest" matches; with a mandatory subject the `to`
        # branch was unreachable for any natural phrasing.
        re.compile(r"\b(prioritiz(e|ation)|capital plan(ning)?|harden(ing|s)?|"
                   r"infrastructure investment|where (should|to) (we |the )?(invest|"
                   r"prioritize|harden)|MTA.+prioritize|DEP.+prioritize|"
                   r"protection envelope|outside (it|the protection)|"
                   r"resilien(ce|cy) project)\b", re.I),
    ]),
    ("habitability_decision", [
        # No trailing `\b` on this group: alternatives match as prefixes.
        re.compile(r"\b(should I worry|should I (be|consider)|is (it|this) safe|"
                   r"can I (rent|live|move|raise (my )?kids?)|considering (renting|leasing|moving)|"
                   r"(thinking about|planning to) (rent|lease|move|buy)|"
                   r"is (this|that|the landlord) true|landlord (says|claims|told)|"
                   r"no flood history|just got a lease|new lease|signing a lease|"
                   r"\bworry\b)", re.I),
    ]),
    ("underwriting", [
        re.compile(r"\b(underwrit(e|er|ing|able)|actuarial|loss history|"
                   r"insurabl[ey]|catastrophe (model|risk)|"
                   r"insurance (audit|memo|profile)|"
                   r"audit (chain|trail))\b", re.I),
    ]),
    ("journalism", [
        # NOTE(review): there is no trailing `\b`, so `story` also hits
        # "two-story" and `coverage` hits "insurance coverage" when no
        # earlier type matched -- confirm this breadth is intended.
        re.compile(r"\b(reporter|journalist|newsroom|story|coverage|"
                   r"published?|publish (this|the))", re.I),
    ]),
]


def detect(query: str, intent: str | None = None) -> str:
    """Classify the question shape from the raw query and planner intent.

    The stripped query is tested against `_PATTERNS` in order; the first
    question type with any matching pattern wins.

    Args:
        query: Raw user query. Empty, whitespace-only, or falsy values
            are classified as `generic_exposure` without consulting
            the patterns or `intent`.
        intent: Planner intent. The regex patterns never read it; it is
            used only by the fallback below, where a query of at most
            three whitespace-separated words with
            `intent == "neighborhood"` is classified as `journalism`.

    Returns:
        One of `QUESTION_TYPES`; `generic_exposure` when nothing matches.
    """
    # Strip BEFORE the emptiness check: a whitespace-only query used to
    # pass the guard, match no pattern, and then satisfy the "<= 3 words"
    # neighborhood fallback with zero words, yielding `journalism`.
    q = query.strip() if query else ""
    if not q:
        return "generic_exposure"
    for qt, patterns in _PATTERNS:
        if any(p.search(q) for p in patterns):
            return qt
    # Heuristic fallback: bare neighborhood/borough names from a planner
    # context default to journalism (most common stakeholder reading a
    # neighborhood-only query is a reporter or planner). Anything else
    # with no question keyword falls back to generic.
    if intent == "neighborhood" and len(q.split()) <= 3:
        return "journalism"
    return "generic_exposure"


def opening_instruction(question_type: str) -> str:
    """Look up the opening directive registered for `question_type`.

    Types with no entry in `_DIRECTIVES` yield an empty string, as does
    `generic_exposure`, whose entry is itself empty ("no override").
    """
    try:
        return _DIRECTIVES[question_type]
    except KeyError:
        # Unregistered type: behave as "no override".
        return ""


def augment_system_prompt(base: str, *, query: str,
                           intent: str | None = None) -> str:
    """Append a question-aware opening directive to a base system prompt.

    The question type comes from `detect(query, intent)`. When that type
    has no directive (e.g. `generic_exposure`), `base` is returned
    untouched. Otherwise the result is `base`, a blank line, a
    `QUESTION-AWARE OPENING` header line, and the directive text.
    """
    directive = opening_instruction(detect(query, intent))
    if directive:
        header = (
            "QUESTION-AWARE OPENING (this directive overrides ONLY the opening "
            "**Status.** sentence; the four-section structure and citation "
            "discipline above remain in force):"
        )
        return f"{base}\n\n{header}\n{directive}"
    # No directive for this question type: leave the prompt as it was.
    return base