#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
VMware On-Prem → Azure Local Migration Assistant (Gradio)

- Upload design/migration docs (PDF/DOCX/TXT/MD).
- Ask questions; get reliable, detailed, and relevant answers.
- Intent-aware (definitions | how-to | plans | comparisons) with topic-aware details.
- No external APIs. No scikit-learn.

Run locally:
    pip install gradio PyPDF2 python-docx
    python app.py
"""

import os
import io
import re
import math
from typing import List, Tuple, Dict, Any
from collections import Counter

import gradio as gr

# -------------------------
# Optional parsers (graceful fallback)
# -------------------------
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

try:
    import docx  # python-docx
except ImportError:
    docx = None


# =========================
# Trusted sources & FAQ seeds
# =========================
TRUSTED_SOURCES: List[Tuple[str, str]] = [
    # Core guidance
    ("Cloud Adoption Framework (CAF)", "https://learn.microsoft.com/azure/cloud-adoption-framework/"),
    ("Azure Well-Architected Framework (WAF)", "https://learn.microsoft.com/azure/architecture/framework/"),
    # Networking / SDN (used when question is about SDN)
    ("Azure Virtual Network", "https://learn.microsoft.com/azure/virtual-network/"),
    ("Azure SDN concepts (HCI)", "https://learn.microsoft.com/azure-stack/hci/concepts/software-defined-networking"),
    ("Azure Arc (overview)", "https://learn.microsoft.com/azure/azure-arc/"),
    ("Azure Stack HCI (Azure Local)", "https://learn.microsoft.com/azure-stack/hci/"),
    # Migration
    ("Azure VMware Solution (AVS)", "https://learn.microsoft.com/azure/azure-vmware/"),
    ("Azure Migrate", "https://learn.microsoft.com/azure/migrate/"),
    ("VMware HCX Docs", "https://docs.vmware.com/en/VMware-HCX/index.html"),
    # DR
    ("Azure Site Recovery (ASR)", "https://learn.microsoft.com/azure/site-recovery/"),
    # Security
    ("Microsoft Defender for Cloud", "https://learn.microsoft.com/azure/defender-for-cloud/"),
    # Cost
    ("Azure Cost Management", "https://learn.microsoft.com/azure/cost-management-billing/"),
]

FAQ_SEEDS: List[Dict[str, Any]] = [
    {
        "q": "migrate vmware workloads minimal downtime",
        "a": (
            "For minimal downtime, favor AVS with HCX (vMotion/RAV) or Azure Migrate with staged replication. "
            "Prepare the landing zone first, validate connectivity (ExpressRoute/VPN, DNS, MTU), "
            "pilot a few representative VMs, then migrate in waves with rollback and DR drills."
        ),
        "refs": ["Azure VMware Solution (AVS)", "Azure Migrate", "VMware HCX Docs"],
    },
    {
        "q": "recommended migration sequence",
        "a": (
            "1) Establish a governed landing zone. 2) Set up connectivity and identity. "
            "3) Discover/assess with Azure Migrate. 4) Pilot 2–3 VMs. 5) Choose HCX or Azure Migrate cutover. "
            "6) Enforce security/monitoring. 7) Optimize cost and tag consistently."
        ),
        "refs": ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"],
    },
    {
        "q": "dr and backups planning",
        "a": (
            "Define RTO/RPO per app. Use immutable backups and soft-delete. "
            "Leverage ASR for DR where appropriate, run failover drills, and document rollback."
        ),
        "refs": ["Azure Site Recovery (ASR)"],
    },
]


# =========================
# Utilities
# =========================
_WORD_RE = re.compile(r"[A-Za-z0-9_.:/\-]+")

# Punctuation that _WORD_RE allows *inside* a token (URLs, versions,
# hyphenated words) but that must not survive at the token edges.
_EDGE_PUNCT = ".:/-"


def tokenize(text: str) -> List[str]:
    """Split *text* into lowercase tokens.

    Tokens may contain ``. : / -`` internally (so ``software-defined`` and
    ``learn.microsoft.com/azure`` stay intact), but those characters are
    stripped from both ends so that a sentence-final ``"SDN."`` or a label
    like ``"migration:"`` yields the same token as the bare word. Tokens
    that consist only of such punctuation are dropped.

    Args:
        text: Input text; ``None`` or empty yields an empty list.

    Returns:
        The tokens in order of appearance.
    """
    tokens = []
    for raw in _WORD_RE.findall(text or ""):
        tok = raw.strip(_EDGE_PUNCT).lower()
        if tok:
            tokens.append(tok)
    return tokens


def list_refs(ref_names: List[str]) -> str:
    """Render the named trusted sources as Markdown links joined by `` | ``.

    Names that are not present in ``TRUSTED_SOURCES`` are skipped.

    Args:
        ref_names: Display names to look up in ``TRUSTED_SOURCES``.

    Returns:
        A string such as ``"[A](url) | [B](url)"``, or ``""`` when no name
        matched.
    """
    links = []
    for nm in ref_names:
        # Looked up at call time so later edits to TRUSTED_SOURCES are honored.
        url = next((u for name, u in TRUSTED_SOURCES if name == nm), None)
        if url is not None:
            links.append(f"[{nm}]({url})")
    return " | ".join(links) if links else ""


# =========================
# Intent & topic detection
# =========================
_DEF_RE = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\b", re.I)
_HOW_RE = re.compile(r"^\s*(how\s+do|how\s+to|how\s+does|how\s+can)\b", re.I)
_CMP_RE = re.compile(r"\b(vs\.?|versus|compare|difference|differ)\b", re.I)
_PLAN_RE = re.compile(r"\b(plan|approach|steps|roadmap|sequence|strategy)\b", re.I)

# Checked in order; the first topic whose keyword set intersects the
# question's tokens wins.
_TOPIC_KEYWORDS: List[Tuple[str, set]] = [
    ("sdn", {"sdn", "software-defined", "softwaredefined"}),
    ("migration", {"migrate", "migration", "hcx", "avs", "vmotion", "cutover"}),
    ("dr", {"dr", "disaster", "asr", "rto", "rpo", "failover"}),
    ("security", {"defender", "sentinel", "pim", "mfa", "vault", "identity", "entra"}),
    ("cost", {"cost", "reservation", "savings", "rightsizing", "tagging"}),
]

_TOPIC_REFS: Dict[str, List[str]] = {
    "sdn": ["Azure Virtual Network", "Azure SDN concepts (HCI)", "Azure Arc (overview)",
            "Azure Stack HCI (Azure Local)"],
    "migration": ["Azure Migrate", "Azure VMware Solution (AVS)", "VMware HCX Docs",
                  "Cloud Adoption Framework (CAF)"],
    "dr": ["Azure Site Recovery (ASR)", "Azure Well-Architected Framework (WAF)"],
    "security": ["Microsoft Defender for Cloud", "Azure Well-Architected Framework (WAF)"],
    "cost": ["Azure Cost Management", "Azure Well-Architected Framework (WAF)"],
}

_DEFAULT_REFS: List[str] = ["Cloud Adoption Framework (CAF)", "Azure Well-Architected Framework (WAF)"]


def detect_intent(q: str) -> str:
    """Classify the question as ``define``, ``compare``, ``plan``, ``how`` or ``general``.

    The patterns are tested in that priority order, so a question that
    opens with "what is" is a definition even if it also contains a
    comparison or planning keyword.
    """
    if _DEF_RE.search(q):
        return "define"
    if _CMP_RE.search(q):
        return "compare"
    if _PLAN_RE.search(q):
        return "plan"
    if _HOW_RE.search(q):
        return "how"
    return "general"


def detect_topic(q: str) -> str:
    """Return the first topic whose keywords appear among the question's tokens.

    Returns one of ``sdn``, ``migration``, ``dr``, ``security``, ``cost``,
    or ``general`` when no keyword matches.
    """
    toks = set(tokenize(q))
    for topic, keywords in _TOPIC_KEYWORDS:
        if keywords & toks:
            return topic
    return "general"


def topic_refs(topic: str) -> List[str]:
    """Return the trusted-source names for *topic* (CAF + WAF for unknown topics).

    A fresh list is returned on each call so callers may mutate it freely.
    """
    return list(_TOPIC_REFS.get(topic, _DEFAULT_REFS))


# =========================
# Tiny TF-IDF Index
# =========================
class TinyTfidfIndex:
    """Minimal in-memory TF-IDF index with cosine-similarity ranking.

    Term weight is ``(count / doc_length) * idf`` where
    ``idf = log((N + 1) / (df + 1)) + 1``. Documents are supplied already
    tokenized; queries are tokenized with :func:`tokenize`.
    """

    def __init__(self):
        self.docs: List[List[str]] = []       # tokenized documents, in insertion order
        self.df: Counter = Counter()          # term -> number of docs containing it
        self.idf: Dict[str, float] = {}       # term -> smoothed inverse document frequency
        self.doc_norms: List[float] = []      # Euclidean norm of each document vector
        self.voc_size = 0                     # number of distinct terms
        self._doc_vecs: List[Dict[str, float]] = []  # cached TF-IDF vector per document

    def add_documents(self, tokenized_docs: List[List[str]]):
        """Replace the index contents with *tokenized_docs* and recompute all statistics."""
        self.docs = tokenized_docs[:]
        self.df = Counter()
        for toks in self.docs:
            self.df.update(set(toks))
        n_docs = max(1, len(self.docs))
        self.idf = {
            term: math.log((n_docs + 1) / (df + 1)) + 1.0
            for term, df in self.df.items()
        }
        self.voc_size = len(self.idf)
        # Vectors are computed once here so query() does not have to
        # re-count every document on every call.
        self._doc_vecs = [self._vec(toks) for toks in self.docs]
        self.doc_norms = [
            math.sqrt(sum(w * w for w in vec.values()))
            for vec in self._doc_vecs
        ]

    def _vec(self, toks: List[str]) -> Dict[str, float]:
        """Build a sparse TF-IDF vector for *toks*; terms unknown to the index are ignored."""
        tf = Counter(toks)
        total = max(1, len(toks))
        vec: Dict[str, float] = {}
        for term, cnt in tf.items():
            idf = self.idf.get(term)
            if idf is None:
                continue
            vec[term] = (cnt / total) * idf
        return vec

    def query(self, text: str, k: int = 5) -> List[Tuple[int, float]]:
        """Rank documents by cosine similarity to *text*.

        Args:
            text: Free-text query.
            k: Maximum number of results.

        Returns:
            Up to *k* ``(doc_index, similarity)`` pairs, best first. Documents
            with zero similarity are still included. An empty index yields
            ``[]``.
        """
        if not self.docs:
            return []
        qv = self._vec(tokenize(text))
        # 1e-9 guards against division by zero for empty vectors.
        q_norm = math.sqrt(sum(w * w for w in qv.values())) or 1e-9
        sims: List[Tuple[int, float]] = []
        for i, dv in enumerate(self._doc_vecs):
            num = 0.0
            for term, w_q in qv.items():
                w_d = dv.get(term)
                if w_d is not None:
                    num += w_q * w_d
            denom = (self.doc_norms[i] or 1e-9) * q_norm
            sims.append((i, num / denom))
        sims.sort(key=lambda pair: pair[1], reverse=True)
        return sims[:k]


# =========================
# File Parsing
# =========================
def read_pdf_bytes(b: bytes) -> str:
    """Extract text from PDF bytes; returns ``""`` if PyPDF2 is missing or parsing fails."""
    if not PyPDF2:
        return ""
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(b))
        return "\n".join([page.extract_text() or "" for page in reader.pages])
    except Exception:
        # Best effort: a malformed PDF must not abort indexing of the other files.
        return ""


def read_docx_bytes(b: bytes) -> str:
    """Extract paragraph text from DOCX bytes; returns ``""`` if python-docx is missing or parsing fails."""
    if not docx:
        return ""
    try:
        document = docx.Document(io.BytesIO(b))
        return "\n".join(p.text for p in document.paragraphs)
    except Exception:
        # Best effort: also covers inputs that are not valid DOCX packages.
        return ""


def read_text_bytes(b: bytes) -> str:
    """Decode plain-text bytes.

    Tries UTF-8 first (dropping a BOM if present). UTF-16 is attempted only
    when the data starts with a UTF-16 byte-order mark, because BOM-less
    UTF-16 decoding "succeeds" on many 8-bit files and produces garbage.
    Latin-1 is the final fallback and accepts any byte sequence.
    """
    try:
        return b.decode("utf-8-sig")
    except UnicodeDecodeError:
        pass
    if b.startswith((b"\xff\xfe", b"\xfe\xff")):
        try:
            return b.decode("utf-16")
        except UnicodeDecodeError:
            pass
    return b.decode("latin-1")


def parse_file(file_obj: Dict[str, Any]) -> Dict[str, str]:
    """Turn an uploaded-file record into ``{"file": basename, "text": extracted_text}``.

    Args:
        file_obj: Mapping with a ``name`` (or ``orig_name``) and either raw
            ``data`` bytes or a ``path`` to read them from.

    Returns:
        A dict whose ``text`` is ``""`` when no bytes are available or
        extraction fails.
    """
    name = file_obj.get("name") or file_obj.get("orig_name") or "uploaded"
    data = file_obj.get("data")
    if data is None:
        path = file_obj.get("path")
        if path and os.path.exists(path):
            with open(path, "rb") as fh:
                data = fh.read()
    if data is None:
        return {"file": name, "text": ""}

    low = name.lower()
    if low.endswith(".pdf"):
        text = read_pdf_bytes(data)
    elif low.endswith((".docx", ".doc")):
        # NOTE: legacy binary .doc files are routed here too; read_docx_bytes
        # returns "" for anything it cannot open.
        text = read_docx_bytes(data)
    else:
        text = read_text_bytes(data)
    return {"file": os.path.basename(name), "text": text or ""}


# =========================
# Strong definition composer (for “what is …”)
# =========================
_DEF_RE_LEAD = re.compile(r"^\s*(what\s+is|what's|define|explain|tell\s+me\s+about)\s+", re.I)


def _extract_subject_from_question(q: str) -> str:
    """Strip the "what is"/"define"/… lead-in, trailing punctuation and a leading article.

    Returns ``"the topic"`` when nothing is left.
    """
    s = _DEF_RE_LEAD.sub("", q).strip()
    s = re.sub(r"[?.!]+$", "", s).strip()
    s = re.sub(r"^(an?|the)\s+", "", s, flags=re.I)
    return s if s else "the topic"


def _definition_for_subject(subject: str, topic: str) -> Tuple[str, List[str], List[str], List[str], List[str], List[str]]:
    """Build the content sections for a definition-style answer.

    An SDN-specific write-up is used when *topic* is ``"sdn"`` or the
    subject mentions "sdn"; every other subject gets a generic scaffold
    with the subject name substituted in.

    Returns:
        ``(definition, capabilities, how, best_practices, use_cases, refs_list)``
        where the middle four are lists of bullet strings and ``refs_list``
        holds trusted-source names.
    """
    # SDN-specific definition
    if topic == "sdn" or "sdn" in subject.lower():
        definition = (
            f"{subject} is Microsoft's implementation of software-defined networking: "
            "a model that shifts network control into software so you can centrally design, automate, "
            "and protect virtual networks across Azure and Azure Local (Azure Stack HCI). "
            "By separating the control plane from underlying hardware, it enables programmability and "
            "policy-driven management of components such as virtual networks, subnets, firewalls/ACLs, "
            "load balancers, and gateways—well-suited for dynamic cloud and hybrid environments."
        )
        capabilities = [
            "Programmatic creation of VNets, subnets, routing, and address spaces.",
            "Micro-segmentation and policy enforcement for east–west traffic.",
            "Software load balancing and gateway services for app connectivity.",
            "Consistency across Azure and Azure Local (Azure Stack HCI) via Azure Arc.",
        ]
        how = [
            "A centralized control plane applies intent (network topology and policies) to host virtual switches.",
            "Agents/controllers translate intent into concrete configuration on each host.",
            "Telemetry and logs feed monitoring, governance, and troubleshooting workflows.",
        ]
        best = [
            "Use Infrastructure-as-Code (Bicep/Terraform) and GitOps to standardize changes.",
            "Apply least-privilege and RBAC; review segmentation policies regularly.",
            "Integrate with logging/monitoring; alert on drift and policy violations.",
        ]
        uses = [
            "Rapidly provisioning isolated app environments and tiers.",
            "Zero-trust segmentation between workloads and environments.",
            "Hybrid designs spanning Azure and Azure Local with consistent constructs.",
        ]
        refs_list = topic_refs("sdn")
        return definition, capabilities, how, best, uses, refs_list

    # Generic detailed definition for other subjects
    sub = subject.strip()
    definition = (
        f"{sub} is a service/technology that centralizes control through software and policy so teams can "
        f"create, operate, and secure resources consistently across environments."
    )
    capabilities = [
        "Automation and policy-driven configuration to reduce manual effort and errors.",
        "Governance integration (RBAC, tagging, policy) for consistency and compliance.",
        "Observability hooks (logs/metrics) for reliability and performance tuning.",
    ]
    how = [
        "A control plane captures intent (configuration/policies) and applies it to managed resources.",
        "Providers/agents on the platform translate intent into changes at runtime.",
        "Feedback loops via telemetry inform continuous improvement.",
    ]
    best = [
        "Adopt Infrastructure-as-Code and peer reviews for change control.",
        "Define tagging, RBAC roles, and policy baselines early.",
        "Pilot in a non-prod environment before broad rollout.",
    ]
    uses = [
        "Faster, repeatable environment provisioning.",
        "Improved security posture through standardized controls.",
        "Hybrid scenarios requiring consistent management across sites.",
    ]
    # References follow the topic detected from the subject itself rather
    # than the topic argument.
    refs_list = topic_refs(detect_topic(sub))
    return definition, capabilities, how, best, uses, refs_list


def _compose_definition_markdown(query: str, subject: str, topic: str) -> str:
    """Render a definition answer (definition, capabilities, how, practices, uses, sources) as Markdown."""
    definition, capabilities, how, best, uses, refs_list = _definition_for_subject(subject, topic)
    refs = list_refs(refs_list)
    md = [
        f"### {subject} — Detailed definition",
        f"**Your question:** {query}",
        "",
        f"**Definition:** {definition}",
        "",
        "**Key capabilities:**",
    ]
    md += [f"- {c}" for c in capabilities]
    md += ["", "**How it works:**"]
    md += [f"- {h}" for h in how]
    md += ["", "**Best practices:**"]
    md += [f"- {b}" for b in best]
    md += ["", "**Common use cases:**"]
    md += [f"- {u}" for u in uses]
    md += ["", f"**Trusted sources:** {refs}"]
    return "\n".join(md)


# =========================
# RAG: build a detailed answer from uploaded docs
# =========================
_GENERAL_CHECKLIST: List[str] = [
    "Clarify objectives and constraints.",
    "Pilot changes; define rollback and verification.",
]

_TOPIC_CHECKLISTS: Dict[str, List[str]] = {
    "sdn": [
        "Define VNets/subnets and segmentation policy.",
        "Automate with IaC (Bicep/Terraform) and GitOps.",
        "Harden east–west traffic with micro-segmentation.",
        "Plan ingress/egress with LBs and gateways.",
    ],
    "migration": [
        "Establish landing zone (Policy, RBAC, logging).",
        "Connect networks (ER/VPN), validate DNS/MTU.",
        "Discover/assess with Azure Migrate; pilot a few VMs.",
        "Choose HCX or Azure Migrate for cutover; migrate in waves.",
    ],
    "dr": [
        "Define RTO/RPO; choose replication targets.",
        "Run planned/unplanned failover drills.",
        "Ensure immutable backups and soft-delete.",
    ],
    "security": [
        "Enable RBAC/PIM/MFA and Key Vault.",
        "Turn on Defender for Cloud; set policies and alerts.",
        "Collect logs; restrict lateral movement.",
    ],
    "cost": [
        "Right-size; use Reservations/Savings Plans.",
        "Tag resources; set budgets/alerts.",
        "Automate non-prod shutdowns.",
    ],
    "general": _GENERAL_CHECKLIST,
}


def _extract_points(text: str, max_points: int = 6) -> List[str]:
    """Pick up to *max_points* distinct sentences of 40–280 characters from *text*.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``.
    """
    parts = re.split(r"(?<=[.!?])\s+", (text or "").strip())
    pts: List[str] = []
    for part in parts:
        sentence = part.strip()
        if 40 <= len(sentence) <= 280 and sentence not in pts:
            pts.append(sentence)
        if len(pts) >= max_points:
            break
    return pts


def _compose_rag_answer(query: str, snippets: List[str], topic: str) -> str:
    """Render a Markdown answer from document snippets plus a topic-specific checklist.

    Args:
        query: The user's question, echoed back in the answer.
        snippets: Document excerpts; their sentences form the executive summary.
        topic: Topic key selecting the checklist and trusted sources.
    """
    combined = " ".join(snippets)
    points = _extract_points(combined, max_points=6)
    refs = list_refs(topic_refs(topic))

    md = ["### Answer (detailed)", f"**Your question:** {query}", ""]
    if points:
        md += ["**Executive summary:**"] + [f"- {p}" for p in points]
    else:
        md += ["**Executive summary:**", "- Here are key considerations synthesized from your documents."]

    # Add a short topic-aware checklist
    checklist = _TOPIC_CHECKLISTS.get(topic, _GENERAL_CHECKLIST)
    md += ["", "**Recommended steps:**"] + [f"- {s}" for s in checklist]
    md += ["", f"**Trusted sources:** {refs}"]
    return "\n".join(md)


# =========================
# Main Answer Function
# =========================
def answer_faq_or_approach_detailed(question: str,
                                    use_uploaded_docs: bool,
                                    index_obj: Any,
                                    _matrix_unused: Any,
                                    corpus: List[Dict[str, str]]) -> str:
    """Answer *question* as Markdown.

    Resolution order:
      A) definition-style questions get a composed definition;
      B) migration-like questions are matched against ``FAQ_SEEDS``
         (at least half of a seed's tokens must appear in the question);
      C) if enabled and an index exists, an answer is synthesized from the
         top-ranked uploaded documents;
      D) otherwise a topic-aware definition-style fallback is returned.

    Args:
        question: The user's question.
        use_uploaded_docs: Whether step C may be used.
        index_obj: A built ``TinyTfidfIndex`` or ``None``.
        _matrix_unused: Ignored; kept for interface compatibility.
        corpus: Parsed documents aligned with the index (may be ``None``/empty).
    """
    q = (question or "").strip()
    if not q:
        return "Please enter a question."

    intent = detect_intent(q)
    topic = detect_topic(q)

    # A) Definitions: build a strong, subject-specific definition (e.g., "What is Azure SDN?")
    if intent == "define":
        subject = _extract_subject_from_question(q)
        return _compose_definition_markdown(q, subject, topic)

    # B) Migration FAQs (only if the question is migration-like to avoid hijacking)
    q_tokens = set(tokenize(q))
    if {"migrate", "migration", "hcx", "avs"} & q_tokens:
        for item in FAQ_SEEDS:
            seed_tokens = set(tokenize(item["q"]))
            if seed_tokens and (len(seed_tokens & q_tokens) / float(len(seed_tokens))) >= 0.5:
                return (
                    "### Answer (detailed)\n"
                    f"{item['a']}\n\n"
                    f"**Trusted sources:** {list_refs(item.get('refs', []))}"
                )

    # C) RAG over uploaded docs → detailed synthesized answer
    if use_uploaded_docs and index_obj is not None and corpus:
        top = index_obj.query(q, k=6)
        snippets = []
        for i, _sim in top:
            excerpt = (corpus[i]["text"] or "").strip()
            if len(excerpt) > 700:
                excerpt = excerpt[:700] + "..."
            if excerpt:
                snippets.append(excerpt)
        if snippets:
            return _compose_rag_answer(q, snippets, topic)

    # D) Topic-aware fallback (short but relevant)
    subject = _extract_subject_from_question(q) if intent in {"how", "plan", "compare"} else q
    return _compose_definition_markdown(q, subject, topic)


# =========================
# Index Builder
# =========================
def build_index(files: List[Dict[str, Any]]):
    """Parse *files* and build a TF-IDF index over those that yielded text.

    Args:
        files: File records as accepted by :func:`parse_file`.

    Returns:
        ``(index, None, corpus, status_message)``. ``index`` is ``None`` and
        ``corpus`` is empty when there is nothing to index; the second
        element is an unused placeholder.
    """
    if not files:
        return None, None, [], "No files uploaded yet."
    # Parse each file exactly once, then keep only those with extracted text.
    parsed = (parse_file(f) for f in files)
    corpus = [doc for doc in parsed if doc["text"]]
    if not corpus:
        return None, None, [], "No text extracted."
    tokenized = [tokenize(c["text"]) for c in corpus]
    idx = TinyTfidfIndex()
    idx.add_documents(tokenized)
    return idx, None, corpus, f"Indexed {len(corpus)} docs, vocab {idx.voc_size}."


# =========================
# Gradio UI
# =========================
with gr.Blocks(title="VMware → Azure Migration Assistant", fill_height=True) as demo:
    gr.Markdown(
        "## VMware On-Prem → Azure Local Migration Assistant\n"
        "- Upload documents (PDF/DOCX/TXT/MD)\n"
        "- Click **Build Index**\n"
        "- Ask a question. Answers are **detailed** and **topic-relevant**\n"
    )
    with gr.Row():
        with gr.Column(scale=2):
            file_in = gr.Files(label="Upload docs", file_count="multiple", type="filepath")
            index_status = gr.Markdown("No index yet.")
            st_index = gr.State()
            st_matrix = gr.State()
            st_corpus = gr.State()
            build_btn = gr.Button("Build Index", variant="primary")
        with gr.Column(scale=3):
            question = gr.Textbox(
                label="Ask a question",
                placeholder="e.g., What is Azure SDN? • How do I minimize downtime for our AVS migration?"
            )
            use_docs = gr.Checkbox(label="Use uploaded docs (RAG)", value=True)
            ask_btn = gr.Button("Ask", variant="primary")
            answer_box = gr.Markdown("")

    def _collect_files(paths: List[str]):
        """Read each uploaded path into a ``{"name", "data", "path"}`` record; unreadable files are skipped."""
        out = []
        for p in paths or []:
            try:
                with open(p, "rb") as fh:
                    data = fh.read()
            except OSError:
                # Best effort: one unreadable upload must not block the rest.
                continue
            out.append({"name": os.path.basename(p), "data": data, "path": p})
        return out

    def _build(files_paths: List[str]):
        """Build the index and return values in the order of the click handler's outputs."""
        idx, matrix, corpus, status = build_index(_collect_files(files_paths))
        # build_index returns (index, matrix, corpus, status) but the outputs
        # below are [status, index, matrix, corpus]; reorder so each component
        # receives the right value.
        return status, idx, matrix, corpus

    build_btn.click(_build, inputs=[file_in], outputs=[index_status, st_index, st_matrix, st_corpus])
    ask_btn.click(
        answer_faq_or_approach_detailed,
        inputs=[question, use_docs, st_index, st_matrix, st_corpus],
        outputs=[answer_box]
    )


if __name__ == "__main__":
    # SPACE_ID / HF_SPACE_ID are taken as a sign of a hosted Space, where a
    # share link is not requested.
    IN_SPACES = bool(os.getenv("SPACE_ID") or os.getenv("HF_SPACE_ID"))
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")), share=not IN_SPACES)