#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
VMware On-Prem → Azure Local Migration Assistant (Gradio)
- Upload design/migration docs (PDF/DOCX/TXT/MD).
- Ask questions; get reliable, detailed, and relevant answers.
- Intent-aware (definitions | how-to | plans | comparisons) with topic-aware details.
- No external APIs. No scikit-learn.

Run locally:
  pip install gradio PyPDF2 python-docx
  python app.py
"""

import os
import io
import re
import math
from typing import List, Tuple, Dict, Any
from collections import Counter

import gradio as gr

# -------------------------
# Optional parsers (graceful fallback)
# -------------------------
try:
    import PyPDF2
except Exception:
    PyPDF2 = None

try:
    import docx  # python-docx
except Exception:
    docx = None


# =========================
# Trusted sources & FAQ seeds
# =========================

# Catalogue of (display name, documentation URL) pairs.
# `list_refs` looks entries up by exact display name, so every name listed in
# a FAQ seed's "refs" or returned by `topic_refs` must appear here verbatim to
# be rendered as a Markdown link; unknown names are silently omitted.
TRUSTED_SOURCES: List[Tuple[str, str]] = [
    # Core guidance
    ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
    # Networking / SDN (used when question is about SDN)
    ("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
    ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
    ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
    ("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
    # Migration
    ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
    ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html"),
    # DR
    ("Azure Site Recovery (ASR)", "https://learn.microsoft.com/azure/site-recovery/"),
    # Security
    ("Microsoft Defender for Cloud", "https://learn.microsoft.com/azure/defender-for-cloud/"),
    # Cost
    ("Azure Cost Management", "https://learn.microsoft.com/azure/cost-management-billing/"),
]

# Canned answers consulted by `answer_faq_or_approach_detailed`.
# Each seed has:
#   "q"    - keyword phrase; it is tokenized and a seed matches when at least
#            half of its tokens also occur in the user's question.
#   "a"    - the answer text returned verbatim under an "Answer (detailed)" heading.
#   "refs" - display names resolved to links through `list_refs`.
# Seeds are only checked when the question contains one of the tokens
# migrate / migration / hcx / avs, and the first matching seed wins.
FAQ_SEEDS: List[Dict[str, Any]] = [
    {
        "q": "migrate vmware workloads minimal downtime",
        "a": (
            "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
            "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
            "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
        ),
        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"],
    },
    {
        "q": "recommended migration sequence",
        "a": (
            "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
            "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
            "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
        ),
        "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"],
    },
    {
        "q": "dr and backups planning",
        "a": (
            "Define RTO/RPO per app. Use immutable backups and soft-delete. "
            "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
        ),
        "refs": ["Azure Site Recovery (ASR)"],
    },
]


# =========================
# Utilities
# =========================

_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")

# Characters the pattern allows *inside* a token (so URLs, version numbers and
# hyphenated words stay intact) but which are plain sentence punctuation when
# they sit at a token's edge.
_EDGE_PUNCT = ".:/-"

def tokenize(text: str) -> List[str]:
    """Split *text* into lowercase word tokens.

    Tokens are runs of letters, digits and ``_ . : / -``. Leading and trailing
    ``. : / -`` are stripped so that a word at the end of a sentence
    ("migration.") yields the same token as the bare word ("migration");
    without this, keyword and TF-IDF matching miss such words. Tokens that
    consist only of punctuation are dropped.

    Args:
        text: Input string; ``None`` or empty yields an empty list.

    Returns:
        The tokens in order of appearance.
    """
    tokens: List[str] = []
    for raw in _WORD_RE.findall(text or ""):
        tok = raw.strip(_EDGE_PUNCT).lower()
        if tok:
            tokens.append(tok)
    return tokens

def list_refs(ref_names: List[str]) -> str:
    """Render the given source names as Markdown links joined by " | ".

    Each name is resolved against ``TRUSTED_SOURCES`` by exact match on the
    display name (first matching entry wins). Names with no entry are skipped;
    if nothing resolves, the result is an empty string.
    """
    rendered = []
    for wanted in ref_names:
        entry = next((src for src in TRUSTED_SOURCES if src[0] == wanted), None)
        if entry is not None:
            rendered.append(f"[{wanted}]({entry[1]})")
    return " | ".join(rendered)


# =========================
# Intent & topic detection
# =========================

_DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
_HOW_RE = re.compile(r"^\s*(how\s+do|how\s+to|how\s+does|how\s+can)\b", re.I)
_CMP_RE = re.compile(r"\b(vs\.?|versus|compare|difference|differ)\b", re.I)
_PLAN_RE = re.compile(r"\b(plan|approach|steps|roadmap|sequence|strategy)\b", re.I)

def detect_intent(q: str) -> str:
    """Classify a question as "define", "compare", "plan", "how" or "general".

    The patterns are tried in a fixed order and the first hit wins, so a
    question that opens with a definitional lead ("What is the difference ...")
    is reported as "define" even though it also contains a comparison word,
    and "How do I plan ..." is reported as "plan" rather than "how".
    """
    precedence = (
        ("define", _DEF_RE),
        ("compare", _CMP_RE),
        ("plan", _PLAN_RE),
        ("how", _HOW_RE),
    )
    for label, pattern in precedence:
        if pattern.search(q):
            return label
    return "general"

def detect_topic(q: str) -> str:
    """Map a question to a coarse topic label based on keyword tokens.

    The question is tokenized and compared against one keyword set per topic.
    Topics are checked in the order sdn, migration, dr, security, cost; the
    first topic sharing at least one token with the question is returned, and
    "general" is returned when none does.
    """
    words = set(tokenize(q))
    keyword_table = (
        ("sdn", {"sdn", "software-defined", "softwaredefined"}),
        ("migration", {"migrate", "migration", "hcx", "avs", "vmotion", "cutover"}),
        ("dr", {"dr", "disaster", "asr", "rto", "rpo", "failover"}),
        ("security", {"defender", "sentinel", "pim", "mfa", "vault", "identity", "entra"}),
        ("cost", {"cost", "reservation", "savings", "rightsizing", "tagging"}),
    )
    for label, keywords in keyword_table:
        if not keywords.isdisjoint(words):
            return label
    return "general"

def topic_refs(topic: str) -> List[str]:
    """Return the display names of the trusted sources that fit *topic*.

    Recognised topics are "sdn", "migration", "dr", "security" and "cost";
    anything else gets the general CAF/WAF pair. A fresh list is returned on
    every call.
    """
    refs_by_topic = {
        "sdn": ["Azure Virtual Network", "Azure SDN concepts (HCI)", "Azure Arc (overview)", "Azure Stack HCI (Azure Local)"],
        "migration": ["Azure Migrate", "Azure VMware Solution (AVS)", "VMware HCX Docs", "Cloud Adoption Framework (CAF)"],
        "dr": ["Azure Site Recovery (ASR)", "Azure Well-Architected Framework (WAF)"],
        "security": ["Microsoft Defender for Cloud", "Azure Well-Architected Framework (WAF)"],
        "cost": ["Azure Cost Management", "Azure Well-Architected Framework (WAF)"],
    }
    fallback = ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]
    return refs_by_topic.get(topic, fallback)


# =========================
# Tiny TF-IDF Index
# =========================

class TinyTfidfIndex:
    """Minimal in-memory TF-IDF index ranked by cosine similarity.

    Term weight is ``(count / doc_length) * idf`` with the smoothed inverse
    document frequency ``log((N + 1) / (df + 1)) + 1``. Document vectors and
    their norms are computed once in :meth:`add_documents`, so a query only
    touches the terms it actually contains instead of re-counting every
    document's tokens.
    """

    def __init__(self):
        self.docs: List[List[str]] = []
        self.df: Counter = Counter()
        self.idf: Dict[str, float] = {}
        self.doc_norms: List[float] = []
        self.voc_size = 0
        # Sparse TF-IDF vector per document, parallel to ``docs``.
        self.doc_vecs: List[Dict[str, float]] = []

    def add_documents(self, tokenized_docs: List[List[str]]):
        """Replace the index contents with *tokenized_docs* (one token list per document)."""
        self.docs = tokenized_docs[:]
        self.df = Counter()
        for toks in self.docs:
            # set(): document frequency counts a term once per document.
            self.df.update(set(toks))
        n_docs = max(1, len(self.docs))
        self.idf = {term: math.log((n_docs + 1) / (df + 1)) + 1.0 for term, df in self.df.items()}
        self.voc_size = len(self.idf)

        self.doc_vecs = []
        self.doc_norms = []
        for toks in self.docs:
            vec = self._vec(toks)
            norm_sq = 0.0
            for w in vec.values():
                norm_sq += w * w
            self.doc_vecs.append(vec)
            self.doc_norms.append(math.sqrt(norm_sq))

    def _vec(self, toks: List[str]) -> Dict[str, float]:
        """Return the sparse TF-IDF vector of *toks*; terms unknown to the index are skipped."""
        tf = Counter(toks)
        total = max(1, len(toks))
        v = {}
        for term, cnt in tf.items():
            idf = self.idf.get(term)
            if idf is None:
                continue
            v[term] = (cnt / total) * idf
        return v

    def query_tokens(self, toks: List[str], k: int = 5) -> List[Tuple[int, float]]:
        """Rank documents against an already-tokenized query.

        Args:
            toks: Query tokens.
            k: Maximum number of results; ``k <= 0`` yields an empty list.

        Returns:
            Up to *k* ``(document_index, cosine_similarity)`` pairs, best first.
            Documents sharing no term with the query are still listed, with
            similarity 0.0; ties keep document order.
        """
        if not self.docs or k <= 0:
            return []
        qv = self._vec(toks)
        # The 1e-9 floors avoid division by zero for empty queries/documents.
        q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
        sims: List[Tuple[int, float]] = []
        for i, dvec in enumerate(self.doc_vecs):
            num = 0.0
            for term, w_q in qv.items():
                w_d = dvec.get(term)
                if w_d is not None:
                    num += w_q * w_d
            denom = (self.doc_norms[i] or 1e-9) * q_norm
            sims.append((i, num / denom))
        sims.sort(key=lambda x: x[1], reverse=True)
        return sims[:k]

    def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
        """Tokenize *text* and rank documents; see :meth:`query_tokens`."""
        if not self.docs:
            return []
        return self.query_tokens(tokenize(text), k)


# =========================
# File Parsing
# =========================

def read_pdf_bytes(b: bytes) -> str:
    """Extract text from PDF bytes, one page per line group.

    Returns an empty string when PyPDF2 is not available or when reading or
    text extraction raises; pages without extractable text contribute an
    empty string.
    """
    if not PyPDF2:
        return ""
    try:
        pdf = PyPDF2.PdfReader(io.BytesIO(b))
        page_texts = []
        for page in pdf.pages:
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)
    except Exception:
        # Best-effort parser: an unreadable PDF simply contributes no text.
        return ""

def read_docx_bytes(b: bytes) -> str:
    """Extract paragraph text from DOCX bytes, joined with newlines.

    Returns an empty string when python-docx is not available or when the
    document cannot be opened or read.
    """
    if not docx:
        return ""
    try:
        document = docx.Document(io.BytesIO(b))
        paragraphs = [para.text for para in document.paragraphs]
        return "\n".join(paragraphs)
    except Exception:
        # Best-effort parser: an unreadable document simply contributes no text.
        return ""

def read_text_bytes(b: bytes) -> str:
    """Decode raw file bytes to text using a small encoding cascade.

    Order of attempts:
      1. UTF-8 (a leading BOM, if any, is removed).
      2. UTF-16, but only when there is evidence for it: a UTF-16 BOM, or NUL
         bytes in the data (whose positions pick little/big endian).
      3. Latin-1, which accepts every byte sequence.

    UTF-16 is deliberately not tried blindly: almost any even-length byte
    string "decodes" as UTF-16, which would turn ordinary Latin-1/cp1252 text
    into unrelated characters.

    Args:
        b: File content. Anything that is not bytes/bytearray, or is empty,
           yields an empty string.

    Returns:
        The decoded text.
    """
    if not isinstance(b, (bytes, bytearray)) or not b:
        return ""

    try:
        return b.decode("utf-8-sig")
    except UnicodeDecodeError:
        pass  # not UTF-8; continue with the next candidate

    candidate = None
    if bytes(b[:2]) in (b"\xff\xfe", b"\xfe\xff"):
        candidate = "utf-16"  # the codec consumes the BOM and picks the byte order
    elif b"\x00" in b:
        head = bytes(b[:1024])
        # Mostly-ASCII UTF-16-LE has its NULs at odd offsets, UTF-16-BE at even ones.
        if head[1::2].count(b"\x00") >= head[0::2].count(b"\x00"):
            candidate = "utf-16-le"
        else:
            candidate = "utf-16-be"

    if candidate is not None:
        try:
            return b.decode(candidate)
        except UnicodeDecodeError:
            pass  # e.g. odd length or lone surrogate; fall back to latin-1

    return b.decode("latin-1")

def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
    """Turn an uploaded-file record into ``{"file": <base name>, "text": <extracted text>}``.

    Args:
        file_obj: Mapping with optional keys ``"name"`` / ``"orig_name"`` (used
            for the label and to pick a parser by extension), ``"data"`` (raw
            bytes) and ``"path"`` (read from disk when ``"data"`` is absent).

    Returns:
        A dict whose ``"file"`` is always the base name of the upload and
        whose ``"text"`` is the extracted text, or ``""`` when no bytes are
        available or nothing could be extracted.
    """
    name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
    # Computed once so both return paths report the same label.
    label = os.path.basename(name)

    data = file_obj.get("data")
    if data is None:
        path = file_obj.get("path")
        if path and os.path.exists(path):
            with open(path, "rb") as fh:
                data = fh.read()
    if data is None:
        return {"file": label, "text": ""}

    low = name.lower()
    if low.endswith(".pdf"):
        text = read_pdf_bytes(data)
    elif low.endswith((".docx", ".doc")):
        # NOTE(review): legacy binary .doc is sent to the DOCX reader as well;
        # if it cannot be opened there, the reader returns "" — confirm intended.
        text = read_docx_bytes(data)
    else:
        text = read_text_bytes(data)
    return {"file": label, "text": text or ""}


# =========================
# Strong definition composer (for “what is …”)
# =========================

_DEF_RE_LEAD = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\s+", re.I)

def _extract_subject_from_question(q: str) -> str:
    s = _DEF_RE_LEAD.sub("", q).strip()
    s = re.sub(r"[?.!]+$", "", s).strip()
    s = re.sub(r"^(an?|the)\s+", "", s, flags=re.I)
    return s if s else "the topic"

def _definition_for_subject(subject: str, topic: str) -> Tuple[str, List[str], List[str], List[str], List[str], List[str]]:
    """
    Build the content sections for a definition-style answer.

    Returns a 6-tuple:
    (definition, capabilities[], how[], best_practices[], use_cases[], refs_list).

    When the topic is "sdn" or the subject mentions "sdn", SDN-specific text is
    produced with the SDN reference set. Every other subject gets a generic
    scaffold that only substitutes the subject name; its references are chosen
    from the topic detected in the subject itself.
    """
    is_sdn = topic == "sdn" or "sdn" in subject.lower()

    if not is_sdn:
        # Generic scaffold: identical wording for every non-SDN subject.
        name = subject.strip()
        generic_definition = (
            f"{name} is a service/technology that centralizes control through software and policy so teams can "
            "create, operate, and secure resources consistently across environments."
        )
        generic_capabilities = [
            "Automation and policy-driven configuration to reduce manual effort and errors.",
            "Governance integration (RBAC, tagging, policy) for consistency and compliance.",
            "Observability hooks (logs/metrics) for reliability and performance tuning.",
        ]
        generic_how = [
            "A control plane captures intent (configuration/policies) and applies it to managed resources.",
            "Providers/agents on the platform translate intent into changes at runtime.",
            "Feedback loops via telemetry inform continuous improvement.",
        ]
        generic_best = [
            "Adopt Infrastructure-as-Code and peer reviews for change control.",
            "Define tagging, RBAC roles, and policy baselines early.",
            "Pilot in a non-prod environment before broad rollout.",
        ]
        generic_uses = [
            "Faster, repeatable environment provisioning.",
            "Improved security posture through standardized controls.",
            "Hybrid scenarios requiring consistent management across sites.",
        ]
        generic_refs = topic_refs(detect_topic(name))
        return generic_definition, generic_capabilities, generic_how, generic_best, generic_uses, generic_refs

    # SDN-specific content.
    sdn_definition = (
        f"{subject} is Microsoft's implementation of software-defined networking: "
        "a model that shifts network control into software so you can centrally design, automate, "
        "and protect virtual networks across Azure and Azure Local (Azure Stack HCI). "
        "By separating the control plane from underlying hardware, it enables programmability and "
        "policy-driven management of components such as virtual networks, subnets, firewalls/ACLs, "
        "load balancers, and gateways—well-suited for dynamic cloud and hybrid environments."
    )
    sdn_capabilities = [
        "Programmatic creation of VNets, subnets, routing, and address spaces.",
        "Micro-segmentation and policy enforcement for east–west traffic.",
        "Software load balancing and gateway services for app connectivity.",
        "Consistency across Azure and Azure Local (Azure Stack HCI) via Azure Arc.",
    ]
    sdn_how = [
        "A centralized control plane applies intent (network topology and policies) to host virtual switches.",
        "Agents/controllers translate intent into concrete configuration on each host.",
        "Telemetry and logs feed monitoring, governance, and troubleshooting workflows.",
    ]
    sdn_best = [
        "Use Infrastructure-as-Code (Bicep/Terraform) and GitOps to standardize changes.",
        "Apply least-privilege and RBAC; review segmentation policies regularly.",
        "Integrate with logging/monitoring; alert on drift and policy violations.",
    ]
    sdn_uses = [
        "Rapidly provisioning isolated app environments and tiers.",
        "Zero-trust segmentation between workloads and environments.",
        "Hybrid designs spanning Azure and Azure Local with consistent constructs.",
    ]
    return sdn_definition, sdn_capabilities, sdn_how, sdn_best, sdn_uses, topic_refs("sdn")

def _compose_definition_markdown(query: str, subject: str, topic: str) -> str:
    """Render a definition-style answer for *subject* as Markdown.

    The sections (definition, key capabilities, how it works, best practices,
    common use cases) come from ``_definition_for_subject``; the reference
    names it returns are turned into links with ``list_refs``. The original
    question is echoed under the heading.
    """
    definition, capabilities, how, best, uses, refs_list = _definition_for_subject(subject, topic)
    source_links = list_refs(refs_list)

    def bullets(items):
        return [f"- {entry}" for entry in items]

    lines = [
        f"### {subject} — Detailed definition",
        f"**Your question:** {query}",
        "",
        f"**Definition:** {definition}",
        "",
        "**Key capabilities:**",
        *bullets(capabilities),
        "",
        "**How it works:**",
        *bullets(how),
        "",
        "**Best practices:**",
        *bullets(best),
        "",
        "**Common use cases:**",
        *bullets(uses),
        "",
        f"**Trusted sources:** {source_links}",
    ]
    return "\n".join(lines)


# =========================
# RAG: build a detailed answer from uploaded docs
# =========================

def _extract_points(text: str, max_points: int = 6) -> List[str]:
    parts = re.split(r"(?<=[.!?])\s+", (text or "").strip())
    pts = []
    for p in parts:
        p = p.strip()
        if 40 <= len(p) <= 280 and p not in pts:
            pts.append(p)
        if len(pts) >= max_points:
            break
    return pts

def _compose_rag_answer(query: str, snippets: List[str], topic: str) -> str:
    """Render a Markdown answer built from document excerpts.

    The excerpts are joined and up to six sentences are pulled out as an
    "Executive summary" (a single placeholder bullet is used when none
    qualify). A fixed, topic-specific checklist follows as "Recommended
    steps", and the topic's trusted sources are appended as links.
    """
    default_steps = ["Clarify objectives and constraints.", "Pilot changes; define rollback and verification."]
    steps_by_topic = {
        "sdn": [
            "Define VNets/subnets and segmentation policy.",
            "Automate with IaC (Bicep/Terraform) and GitOps.",
            "Harden east–west traffic with micro-segmentation.",
            "Plan ingress/egress with LBs and gateways."
        ],
        "migration": [
            "Establish landing zone (Policy, RBAC, logging).",
            "Connect networks (ER/VPN), validate DNS/MTU.",
            "Discover/assess with Azure Migrate; pilot a few VMs.",
            "Choose HCX or Azure Migrate for cutover; migrate in waves."
        ],
        "dr": [
            "Define RTO/RPO; choose replication targets.",
            "Run planned/unplanned failover drills.",
            "Ensure immutable backups and soft-delete."
        ],
        "security": [
            "Enable RBAC/PIM/MFA and Key Vault.",
            "Turn on Defender for Cloud; set policies and alerts.",
            "Collect logs; restrict lateral movement."
        ],
        "cost": [
            "Right-size; use Reservations/Savings Plans.",
            "Tag resources; set budgets/alerts.",
            "Automate non-prod shutdowns."
        ],
        "general": [
            "Clarify objectives and constraints.",
            "Pilot changes; define rollback and verification."
        ],
    }

    summary = _extract_points(" ".join(snippets), max_points=6)
    if not summary:
        summary = ["Here are key considerations synthesized from your documents."]
    source_links = list_refs(topic_refs(topic))
    steps = steps_by_topic.get(topic, default_steps)

    lines = ["### Answer (detailed)", f"**Your question:** {query}", "", "**Executive summary:**"]
    lines.extend(f"- {point}" for point in summary)
    lines.append("")
    lines.append("**Recommended steps:**")
    lines.extend(f"- {step}" for step in steps)
    lines.append("")
    lines.append(f"**Trusted sources:** {source_links}")
    return "\n".join(lines)


# =========================
# Main Answer Function
# =========================

def answer_faq_or_approach_detailed(question: str, use_uploaded_docs: bool, index_obj: Any, _matrix_unused: Any, corpus: List[Dict[str,str]]) -> str:
    """Answer a user question as Markdown, trying four strategies in order.

    1. Definition questions get a composed definition for the extracted subject.
    2. Questions mentioning migrate/migration/hcx/avs are matched against
       ``FAQ_SEEDS``; a seed matches when at least half of its tokens occur in
       the question, and the first match is returned.
    3. If document use is enabled and an index plus corpus are available, the
       top six ranked documents each contribute the first 700 characters of
       their text, and an answer is synthesized from those excerpts.
    4. Otherwise a definition-style scaffold is produced as a fallback.

    ``_matrix_unused`` is accepted but never read. A blank question returns a
    prompt to enter one.
    """
    q = (question or "").strip()
    if not q:
        return "Please enter a question."

    intent, topic = detect_intent(q), detect_topic(q)

    # 1) Definitions
    if intent == "define":
        return _compose_definition_markdown(q, _extract_subject_from_question(q), topic)

    # 2) Migration FAQs — gated on migration vocabulary so unrelated questions
    #    are not answered from the seeds.
    words = set(tokenize(q))
    if not words.isdisjoint({"migrate", "migration", "hcx", "avs"}):
        for seed in FAQ_SEEDS:
            seed_words = set(tokenize(seed["q"]))
            if not seed_words:
                continue
            coverage = len(seed_words & words) / float(len(seed_words))
            if coverage >= 0.5:
                return (
                    "### Answer (detailed)\n"
                    f"{seed['a']}\n\n"
                    f"**Trusted sources:** {list_refs(seed.get('refs', []))}"
                )

    # 3) Retrieval over uploaded documents
    if use_uploaded_docs and index_obj is not None and corpus:
        excerpts = []
        for doc_idx, _score in index_obj.query(q, k=6):
            body = (corpus[doc_idx]["text"] or "").strip()
            if not body:
                continue
            excerpts.append(body[:700] + "..." if len(body) > 700 else body)
        if excerpts:
            return _compose_rag_answer(q, excerpts, topic)

    # 4) Fallback scaffold
    if intent in {"how", "plan", "compare"}:
        subject = _extract_subject_from_question(q)
    else:
        subject = q
    return _compose_definition_markdown(q, subject, topic)


# =========================
# Index Builder
# =========================

def build_index(files: List[Dict[str, Any]]):
    """Parse uploaded files and build a TF-IDF index over their text.

    Each file is parsed exactly once; files that yield no text are left out.

    Args:
        files: File records accepted by ``parse_file``.

    Returns:
        A 4-tuple ``(index, matrix, corpus, status)``:
        ``index`` is a ``TinyTfidfIndex`` (``None`` when nothing was indexed),
        ``matrix`` is always ``None`` (placeholder slot),
        ``corpus`` is the list of parsed ``{"file", "text"}`` dicts in index order,
        ``status`` is a human-readable message.
    """
    if not files:
        return None, None, [], "No files uploaded yet."
    corpus = []
    for f in files:
        parsed = parse_file(f)
        if parsed["text"]:
            corpus.append(parsed)
    if not corpus:
        return None, None, [], "No text extracted."
    tokenized = [tokenize(c["text"]) for c in corpus]
    idx = TinyTfidfIndex()
    idx.add_documents(tokenized)
    return idx, None, corpus, f"Indexed {len(corpus)} docs, vocab {idx.voc_size}."


# =========================
# Gradio UI
# =========================

# UI layout and event wiring. Left column: upload + index build; right column:
# question, RAG toggle and the rendered answer.
with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
    gr.Markdown(
        "## VMware On-Prem → Azure Local Migration Assistant\n"
        "- Upload documents (PDF/DOCX/TXT/MD)\n"
        "- Click **Build Index**\n"
        "- Ask a question. Answers are **detailed** and **topic-relevant**\n"
    )
    with gr.Row():
        with gr.Column(scale=2):
            file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
            index_status = gr.Markdown("No index yet.")
            # Per-session holders for the values returned by build_index.
            st_index = gr.State()
            st_matrix = gr.State()
            st_corpus = gr.State()
            build_btn = gr.Button("Build Index", variant="primary")
        with gr.Column(scale=3):
            question = gr.Textbox(
                label="Ask a question",
                placeholder="e.g., What is Azure SDN?  •  How do I minimize downtime for our AVS migration?"
            )
            use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
            ask_btn = gr.Button("Ask", variant="primary")
            answer_box = gr.Markdown("")

    def _collect_files(paths: List[str]):
        """Read each uploaded path into a ``{"name", "data", "path"}`` record, skipping unreadable ones."""
        out = []
        for p in paths or []:
            try:
                with open(p, "rb") as fh:
                    data = fh.read()
                out.append({"name": os.path.basename(p), "data": data, "path": p})
            except (OSError, TypeError, ValueError):
                # Best-effort: a missing/unreadable upload or a malformed path
                # entry is skipped so the remaining files can still be indexed.
                continue
        return out

    def _build(files_paths: List[str]):
        """Click handler: load the uploads and return build_index's (index, matrix, corpus, status)."""
        files = _collect_files(files_paths)
        return build_index(files)

    # Output order must mirror build_index's return order
    # (index, matrix, corpus, status); listing the status component first would
    # put the index object into the Markdown box and leave st_index empty.
    build_btn.click(_build, inputs=[file_in], outputs=[st_index, st_matrix, st_corpus, index_status])

    ask_btn.click(
        answer_faq_or_approach_detailed,
        inputs=[question, use_docs, st_index, st_matrix, st_corpus],
        outputs=[answer_box]
    )

if __name__ == "__main__":
    # A SPACE_ID / HF_SPACE_ID environment variable switches off the share
    # link; in every other environment launch() is asked to create one.
    IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
    # PORT overrides the default 7860; the server binds to all interfaces.
    port = int(os.getenv("PORT", 7860))
    demo.launch(
        server_name="0.0.0.0",
        server_port=port,
        share=not IN_SPACES,
    )