Parthiban97 commited on
Commit
b0e15c1
Β·
verified Β·
1 Parent(s): 35d4c56

Upload 15 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [theme]
2
+ base = "dark"
3
+ primaryColor = "#4A90E2"
4
+ backgroundColor = "#0f0f0f"
5
+ secondaryBackgroundColor = "#1a1a1a"
6
+ textColor = "#e5e5e5"
7
+ font = "sans serif"
app.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import sys
5
+ import tempfile
6
+ import time
7
+ import traceback
8
+ from contextlib import redirect_stderr, redirect_stdout
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ import streamlit as st
14
+
15
+ from src.csv_enrichment import (
16
+ TARGET_COLUMNS,
17
+ EnrichmentConfig,
18
+ enrich_csv, # use canonical name (alias also works)
19
+ lookup_fund_metric_value,
20
+ )
21
+ from src.data_engine import run_data_engine
22
+
23
+
24
+ # ── Session logging ───────────────────────────────────────────────────────────
25
+
26
+ def _init_session_log() -> Path:
27
+ if "session_log_path" not in st.session_state:
28
+ log_dir = Path("logs") / "streamlit_sessions"
29
+ log_dir.mkdir(parents=True, exist_ok=True)
30
+ stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
31
+ log_path = log_dir / f"session_{stamp}.log"
32
+ log_path.write_text(
33
+ f"[{datetime.now().isoformat()}] session_started\n",
34
+ encoding="utf-8",
35
+ )
36
+ st.session_state["session_log_path"] = str(log_path)
37
+ return Path(st.session_state["session_log_path"])
38
+
39
+
40
def _log_session_event(message: str) -> None:
    """Append one timestamped line to the session log. Best-effort: never raises."""
    try:
        target = _init_session_log()
        with target.open("a", encoding="utf-8") as handle:
            handle.write(f"[{datetime.now().isoformat()}] {message}\n")
    except Exception:
        # Logging must never interrupt the UI flow.
        pass
47
+
48
+
49
def _log_session_block(title: str, content: str) -> None:
    """Append a delimited multi-line block to the session log. Best-effort: never raises."""
    try:
        target = _init_session_log()
        with target.open("a", encoding="utf-8") as handle:
            handle.write(f"[{datetime.now().isoformat()}] --- {title} (start) ---\n")
            if content.strip():
                handle.write(content.rstrip() + "\n")
            else:
                handle.write("(no output)\n")
            handle.write(f"[{datetime.now().isoformat()}] --- {title} (end) ---\n")
    except Exception:
        # Logging must never interrupt the UI flow.
        pass
58
+
59
+
60
+ # ── Captured output runner ────────────────────────────────────────────────────
61
+
62
+ def _run_with_captured_output(func: Any, *args: Any, **kwargs: Any) -> tuple[Any, str]:
63
+ """Run function, mirror prints to terminal, capture for UI display."""
64
+
65
+ class _TeeCapture(io.TextIOBase):
66
+ def __init__(self, mirror: Any, on_write: Any = None) -> None:
67
+ self._mirror = mirror
68
+ self._buffer = io.StringIO()
69
+ self._on_write = on_write
70
+
71
+ def write(self, s: str) -> int:
72
+ text = str(s)
73
+ self._buffer.write(text)
74
+ try:
75
+ self._mirror.write(text)
76
+ self._mirror.flush()
77
+ except Exception:
78
+ pass
79
+ if self._on_write is not None:
80
+ try:
81
+ self._on_write(text)
82
+ except Exception:
83
+ pass
84
+ return len(text)
85
+
86
+ def flush(self) -> None:
87
+ try:
88
+ self._mirror.flush()
89
+ except Exception:
90
+ pass
91
+
92
+ def getvalue(self) -> str:
93
+ return self._buffer.getvalue()
94
+
95
+ live_callback = kwargs.pop("live_callback", None)
96
+ out_tee = _TeeCapture(sys.__stdout__, live_callback)
97
+ err_tee = _TeeCapture(sys.__stderr__, live_callback)
98
+ with redirect_stdout(out_tee), redirect_stderr(err_tee):
99
+ result = func(*args, **kwargs)
100
+ return result, out_tee.getvalue() + err_tee.getvalue()
101
+
102
+
103
+ # ── CSS ───────────────────────────────────────────────────────────────────────
104
+
105
def _inject_custom_css() -> None:
    """Inject the app's dark-theme CSS.

    Palette mirrors .streamlit/config.toml (primary #4A90E2, bg #0f0f0f,
    secondary #1a1a1a, text #e5e5e5) via CSS custom properties on :root.
    """
    st.markdown(
        """
        <style>
        :root {
            --mf-primary: #4A90E2;
            --mf-accent: #22c55e;
            --mf-bg: #0f0f0f;
            --mf-bg-secondary: #1a1a1a;
            --mf-surface: #1a1a1a;
            --mf-text: #e5e5e5;
            --mf-text-muted: #a0a0a0;
            --mf-border: #333333;
        }
        .mf-shell { max-width: 1100px; margin: 0 auto; padding: 0 0 3rem 0; }
        .mf-hero {
            padding: 1.9rem 2.1rem 1.5rem 2.1rem;
            border-radius: 18px;
            background: var(--mf-bg-secondary);
            border: 1px solid var(--mf-border);
        }
        .mf-kicker {
            letter-spacing: .16em; font-size: 0.75rem;
            text-transform: uppercase; color: var(--mf-primary); margin-bottom: 0.5rem;
        }
        .mf-title {
            font-size: 2.2rem; font-weight: 650;
            line-height: 1.1; color: var(--mf-text); margin-bottom: 0.75rem;
        }
        .mf-subtitle { max-width: 40rem; font-size: 0.95rem; color: var(--mf-text-muted); }
        .mf-panel {
            margin-top: 1.75rem; padding: 1.5rem 1.75rem 1.75rem 1.75rem;
            border-radius: 20px; background: var(--mf-surface);
            border: 1px solid var(--mf-border);
        }
        .mf-helper { font-size: 0.8rem; color: var(--mf-text-muted); margin-bottom: 0.9rem; }
        .mf-steps { font-size: 0.78rem; color: var(--mf-text-muted); margin-top: 0.3rem; }
        .mf-steps li { margin-bottom: 0.1rem; }
        .mf-metrics { display: flex; flex-wrap: wrap; gap: 0.75rem; margin-top: 1.25rem; }
        .mf-metric {
            flex: 0 0 auto; min-width: 140px; padding: 0.6rem 0.8rem;
            border-radius: 0.9rem; border: 1px solid var(--mf-border);
            background: var(--mf-bg-secondary);
        }
        .mf-metric-label {
            font-size: 0.72rem; text-transform: uppercase;
            letter-spacing: 0.09em; color: var(--mf-text-muted); margin-bottom: 0.2rem;
        }
        .mf-metric-value { font-size: 1.05rem; font-weight: 600; color: var(--mf-accent); }
        .mf-timing {
            margin-top: 1rem; padding: 0.75rem 1rem;
            border-radius: 0.75rem; border: 1px solid var(--mf-border);
            background: var(--mf-bg-secondary); font-size: 0.8rem;
            color: var(--mf-text-muted);
        }
        .mf-download-label {
            font-size: 0.8rem; color: var(--mf-text-muted);
            margin-top: 1.4rem; margin-bottom: 0.35rem;
        }
        .stFileUploader div[data-testid="stFileUploaderDropzone"] {
            border-radius: 0.9rem; border-color: var(--mf-border);
            background: var(--mf-bg-secondary);
        }
        .stButton > button[kind="primary"], .stDownloadButton > button {
            border-radius: 0.5rem; border: none;
            background: var(--mf-primary) !important;
            color: white !important; font-weight: 600;
        }
        .stApp, [data-testid="stAppViewContainer"] { background-color: var(--mf-bg); }
        .block-container { padding-top: 1.5rem; }
        @media (max-width: 768px) {
            .mf-hero { padding: 1.4rem 1.3rem 1.2rem 1.3rem; }
            .mf-title { font-size: 1.6rem; }
        }
        </style>
        """,
        unsafe_allow_html=True,
    )
183
+
184
+
185
+ # ── Main ──────────────────────────────────────────────────────────────────────
186
+
187
def main() -> None:
    """Render the single-page app: upload CSV -> enrich -> score -> download.

    Flow: hero + two tabs. "Run analysis" handles upload, runs the two-phase
    pipeline (enrich_csv, then run_data_engine) with live log streaming, and
    offers the generated workbook for download. "How scoring works" is static.
    """
    st.set_page_config(
        page_title="MF Scoring Engine · Advisor Demo",
        page_icon="📈",
        layout="centered",
    )

    _inject_custom_css()
    _init_session_log()
    _log_session_event("app_rendered")

    st.markdown('<div class="mf-shell">', unsafe_allow_html=True)

    st.markdown(
        """
        <section class="mf-hero">
          <div class="mf-kicker">Advisor tool</div>
          <div class="mf-title">Score your mutual fund list in Excel.</div>
          <p class="mf-subtitle">
            Upload your mutual fund CSV. The app runs enrichment (NAV engine → web fallback → median),
            scores every fund, and gives you a ready-to-share Excel workbook.
          </p>
        </section>
        """,
        unsafe_allow_html=True,
    )

    st.markdown('<section class="mf-panel">', unsafe_allow_html=True)

    tab_run, tab_about = st.tabs(["Run analysis", "How scoring works"])

    with tab_run:
        st.markdown("### Upload CSV & generate workbook")
        st.markdown(
            """
            <p class="mf-helper">
            Upload your standard fund universe CSV
            (<code>Fund</code>, <code>Benchmark Type</code>, CAGR columns, etc.).<br>
            <strong>Firecrawl/Tavily is used only for missing P/E and P/B</strong> —
            all risk metrics (Alpha, Sharpe, Sortino, etc.) are computed directly from NAV history.
            </p>
            """,
            unsafe_allow_html=True,
        )

        uploaded_file = st.file_uploader(
            "Step 1 · Upload fund universe CSV",
            type=["csv"],
            help="Same CSV you feed into the offline data engine.",
        )
        if uploaded_file is not None:
            st.caption(
                f"Selected: **{uploaded_file.name}** · "
                f"{(len(uploaded_file.getbuffer()) / 1024):.1f} KB"
            )
            _log_session_event(
                f"uploaded_file name={uploaded_file.name} "
                f"size_kb={(len(uploaded_file.getbuffer())/1024):.1f}"
            )

        st.info(
            "Pipeline: **Scheme code resolution → NAV engine (parallel, 12 workers) "
            "→ PE/PB web lookup → category median fallback → scoring engine**"
        )

        st.markdown(
            """
            <ul class="mf-steps">
              <li>1 — Upload your latest CSV export.</li>
              <li>2 — Click <strong>Run analysis</strong> and watch live logs.</li>
              <li>3 — Download the scored Excel when complete.</li>
            </ul>
            """,
            unsafe_allow_html=True,
        )

        run_clicked = st.button(
            "Step 2 · Run analysis",
            type="primary",
            use_container_width=True,
            disabled=uploaded_file is None,
        )

        # ── State carried across rerun ─────────────────────────────────────
        # NOTE(review): these are plain locals, so they reset on every
        # Streamlit rerun — the download area below only renders in the same
        # run that produced the file. Persisting across reruns would need
        # st.session_state; confirm whether that was the intent.
        generated_bytes: io.BytesIO | None = None
        generated_filename: str | None = None
        funds_count: int | None = None
        categories_count: int | None = None
        enrichment_summary: str | None = None
        timing_html: str | None = None

        if run_clicked:
            _log_session_event("run_analysis_clicked")

            if uploaded_file is None:
                # Defensive: the button is disabled without an upload, but
                # guard anyway in case of a stale rerun.
                st.warning("Please upload a CSV file first.")
                _log_session_event("run_aborted_no_upload")
            else:
                # Unique stem so concurrent/repeated runs don't clobber output.
                base_stem = Path(uploaded_file.name).stem
                stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                input_stem = f"{base_stem}_{stamp}"

                # Persist the upload to disk; the pipeline expects a file path.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
                    tmp.write(uploaded_file.getbuffer())
                    input_path = Path(tmp.name)

                out_dir = Path("output")
                out_dir.mkdir(exist_ok=True)
                generated_path = out_dir / f"fund_analysis_{input_stem}.xlsx"

                t_total_start = time.perf_counter()

                try:
                    with st.status("Processing…", expanded=True) as status:
                        live_lines: list[str] = []
                        live_box = st.empty()

                        # Noise patterns to suppress from the live log box
                        _SUPPRESS = (
                            "missing ScriptRunContext",
                            "FutureWarning",
                            "Passing literal json",
                            "To read from a literal string",
                            "return pd.read_json",
                        )

                        def _live_sink(chunk: str) -> None:
                            # Stream captured stdout/stderr into the UI,
                            # keeping only the last 50 non-noise lines.
                            clean = chunk.replace("\r", "")
                            new = [
                                ln for ln in clean.split("\n")
                                if ln.strip()
                                and not any(s in ln for s in _SUPPRESS)
                            ]
                            if not new:
                                return
                            live_lines.extend(new)
                            if len(live_lines) > 50:
                                del live_lines[:-50]
                            live_box.code("\n".join(live_lines), language="text")

                        # ── Phase 1: Enrichment ────────────────────────────
                        st.write("**1/2 Enrichment** — scheme codes → NAV engine → PE/PB → medians…")
                        t_enrich_start = time.perf_counter()

                        enrichment, enrich_output = _run_with_captured_output(
                            enrich_csv,
                            str(input_path),
                            config=EnrichmentConfig(
                                enabled=True,
                                max_cells=None,
                                min_confidence=0.65,
                                resolve_scheme_codes=True,  # parallel scheme resolution
                                enable_nav_engine=True,  # parallel NAV engine (12 workers)
                                web_search_pe_pb_only=True,  # only PE/PB uses API credits
                                impute_unresolved=True,
                            ),
                            live_callback=_live_sink,
                        )

                        t_enrich_end = time.perf_counter()
                        enrich_secs = t_enrich_end - t_enrich_start

                        _log_session_block("enrichment_output", enrich_output)
                        _log_session_event(
                            f"enrichment_done "
                            f"checked={enrichment.examined_cells} "
                            f"nav={enrichment.nav_cells} "
                            f"web={enrichment.web_cells} "
                            f"imputed={enrichment.imputed_cells} "
                            f"skipped={enrichment.skipped_cells} "
                            f"codes={enrichment.resolved_codes} "
                            f"secs={enrich_secs:.1f}"
                        )

                        st.write(
                            f"  ✅ Enrichment done in **{enrich_secs:.0f}s** — "
                            f"checked {enrichment.examined_cells} cells, "
                            f"NAV filled {enrichment.nav_cells}, "
                            f"web filled {enrichment.web_cells}, "
                            f"imputed {enrichment.imputed_cells}"
                        )

                        # Scoring consumes the enriched CSV, not the upload.
                        pipeline_input_path = Path(enrichment.enriched_csv_path)

                        # ── Phase 2: Scoring + Excel ───────────────────────
                        st.write("**2/2 Scoring engine** — computing scores, ranking, generating Excel…")
                        t_engine_start = time.perf_counter()

                        funds, engine_output = _run_with_captured_output(
                            run_data_engine,
                            csv_path=str(pipeline_input_path),
                            output_path=str(generated_path),
                            use_comprehensive_scoring=True,
                            live_callback=_live_sink,
                        )

                        t_engine_end = time.perf_counter()
                        engine_secs = t_engine_end - t_engine_start
                        total_secs = time.perf_counter() - t_total_start

                        _log_session_block("engine_output", engine_output)
                        _log_session_event(
                            f"engine_done funds={len(funds)} "
                            f"secs={engine_secs:.1f} total={total_secs:.1f}"
                        )

                        st.write(
                            f"  ✅ Scoring done in **{engine_secs:.0f}s** — "
                            f"{len(funds)} funds scored"
                        )

                        status.update(
                            label=f"✅ Complete — {total_secs:.0f}s total",
                            state="complete",
                            expanded=False,
                        )

                except Exception as exc:
                    # Any pipeline failure: log full traceback, show it, abort render.
                    err_text = "".join(traceback.format_exception(exc))
                    _log_session_block("run_failure", err_text)
                    _log_session_event(f"run_failed error={exc}")
                    st.error("Run failed. See terminal for traceback.")
                    st.code(err_text, language="text")
                    return

                # ── Summary ────────────────────────────────────────────────
                if enrichment.errors:
                    st.warning("Enrichment completed with warnings — check scratchpad for details.")
                    if enrichment.scratchpad_path:
                        st.caption(f"Scratchpad: `{enrichment.scratchpad_path}`")

                enrichment_summary = (
                    f"Enrichment: {enrichment.examined_cells} cells checked — "
                    f"NAV filled {enrichment.nav_cells}, "
                    f"web filled {enrichment.web_cells}, "
                    f"imputed {enrichment.imputed_cells}, "
                    f"skipped {enrichment.skipped_cells}."
                )

                timing_html = (
                    f'<div class="mf-timing">'
                    f'⏱ Enrichment: <strong>{enrich_secs:.0f}s</strong> &nbsp;|&nbsp; '
                    f'Scoring: <strong>{engine_secs:.0f}s</strong> &nbsp;|&nbsp; '
                    f'Total: <strong>{total_secs:.0f}s ({total_secs/60:.1f} min)</strong>'
                    f"{'&nbsp; 🎯 Under 3 min!' if total_secs < 180 else ''}"
                    f'</div>'
                )

                # Read the workbook into memory so the download button does
                # not depend on the file staying on disk.
                with generated_path.open("rb") as f:
                    generated_bytes = io.BytesIO(f.read())
                generated_filename = generated_path.name
                funds_count = len(funds)
                categories_count = len({f.category for f in funds})

                st.success("Step 3 · Excel ready — download below.")
                if enrichment_summary:
                    st.info(enrichment_summary)

        # ── Download area (persists after rerun) ──────────────────────────
        # NOTE(review): see the state note above — with plain locals this
        # section only appears during the run that generated the file.
        if generated_bytes and generated_filename:

            if timing_html:
                st.markdown(timing_html, unsafe_allow_html=True)

            st.markdown(
                """
                <div class="mf-metrics">
                  <div class="mf-metric">
                    <div class="mf-metric-label">Schemes scored</div>
                    <div class="mf-metric-value">{funds_count}</div>
                  </div>
                  <div class="mf-metric">
                    <div class="mf-metric-label">Categories</div>
                    <div class="mf-metric-value">{categories_count}</div>
                  </div>
                  <div class="mf-metric">
                    <div class="mf-metric-label">Output format</div>
                    <div class="mf-metric-value">Excel (.xlsx)</div>
                  </div>
                </div>
                """.format(
                    funds_count=funds_count or 0,
                    categories_count=categories_count or 0,
                ),
                unsafe_allow_html=True,
            )

            st.markdown(
                '<div class="mf-download-label">Download the scored workbook:</div>',
                unsafe_allow_html=True,
            )
            st.download_button(
                label="⬇️ Download processed Excel",
                data=generated_bytes.getvalue(),
                file_name=generated_filename,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                use_container_width=True,
            )

    with tab_about:
        st.markdown("### What the pipeline does")
        st.markdown(
            """
            | Phase | What happens |
            |---|---|
            | **0 — Scheme resolution** | Parallel fuzzy-match of missing AMFI scheme codes (8 threads) |
            | **1 — NAV engine** | Trailing 3Y risk metrics computed from mfapi NAV history (12 threads) |
            | **2 — PE/PB web search** | Tavily (primary) or Firecrawl (fallback) — only for missing P/E and P/B |
            | **3 — Median impute** | Category median fills remaining gaps for young/NA funds |
            | **4 — Scoring** | Top/Bottom 10 per category, 10-point weighted model |
            | **5 — Excel export** | Conditional formatting, quartile bands, benchmark rows |

            **Cache**: NAV history is cached in Neon (production) or SQLite (local) with a 7-day TTL.
            Second runs are near-instant for cached funds.
            """
        )

    st.markdown("</section>", unsafe_allow_html=True)
    st.markdown("</div>", unsafe_allow_html=True)
506
+
507
+
508
# Script entry point: render the app when executed directly (streamlit run app.py).
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas>=2.0.0
2
+ openpyxl>=3.1.0
3
+ reportlab>=4.0.0
4
+ matplotlib>=3.7.0
5
+ numpy>=1.24.0
6
+ click>=8.1.0
7
+ streamlit>=1.31.0
8
+ requests>=2.31.0
9
+ python-dateutil>=2.8.2
10
+ fuzzywuzzy>=0.18.0
11
+ python-Levenshtein>=0.21.0
12
+ mftool>=1.0.0
13
+ yfinance>=1.2.0
14
+ beautifulsoup4>=4.14.3
15
+ scipy>=1.17.1
16
+ lxml>=6.0.2
17
+ openai>=1.0.0
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Mutual Fund Portfolio Analyzer
src/charts.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Charts module: generates matplotlib charts for embedded use in PDF reports.
3
+ All functions return a BytesIO buffer containing a PNG image.
4
+ """
5
+
6
+ import io
7
+ import numpy as np
8
+ import matplotlib
9
+ matplotlib.use('Agg') # non-interactive backend
10
+ import matplotlib.pyplot as plt
11
+ import matplotlib.patches as mpatches
12
+ from matplotlib.figure import Figure
13
+ from typing import Dict, List, Optional
14
+
15
+ # Brand colours
16
+ BRAND_BLUE = "#1F3864"
17
+ BRAND_ACCENT = "#2E75B6"
18
+ GREENS = ["#2ECC71", "#27AE60", "#1ABC9C", "#16A085", "#52BE80"]
19
+ REDS = ["#E74C3C", "#C0392B", "#EC7063"]
20
+ PALETTE = [
21
+ "#2E75B6", "#E67E22", "#2ECC71", "#E74C3C", "#9B59B6",
22
+ "#1ABC9C", "#F39C12", "#3498DB", "#D35400", "#27AE60",
23
+ ]
24
+
25
+
26
def _buf(fig: Figure) -> io.BytesIO:
    """Serialize *fig* to a PNG buffer (rewound to the start) and close the figure."""
    png = io.BytesIO()
    fig.savefig(png, format='png', bbox_inches='tight', dpi=150)
    png.seek(0)
    # Close eagerly so batch report generation doesn't accumulate figures.
    plt.close(fig)
    return png
32
+
33
+
34
def holdings_pie_chart(holdings_data: Dict[str, float], title: str = "Portfolio Allocation") -> io.BytesIO:
    """
    Pie chart of holdings by name → value.
    holdings_data: {scheme_name: current_value}
    """
    names = list(holdings_data)
    amounts = [holdings_data[name] for name in names]

    # Legend shows a trimmed form of each scheme name (prefix before '-', max 22 chars).
    legend_labels = [name.split('-')[0].strip()[:22] for name in names]

    fig, ax = plt.subplots(figsize=(5, 4))
    wedges, texts, autotexts = ax.pie(
        amounts,
        labels=None,
        autopct='%1.1f%%',
        startangle=140,
        colors=PALETTE[:len(amounts)],
        pctdistance=0.78,
    )
    for pct_text in autotexts:
        pct_text.set_fontsize(7)
        pct_text.set_color("white")

    ax.legend(wedges, legend_labels, loc="center left", bbox_to_anchor=(1, 0.5),
              fontsize=7, frameon=False)
    ax.set_title(title, fontsize=10, fontweight='bold', color=BRAND_BLUE, pad=10)
    fig.tight_layout()
    return _buf(fig)
63
+
64
+
65
def sector_bar_chart(sector_data: Dict[str, float], title: str = "Sector Allocation (%)") -> io.BytesIO:
    """Horizontal bar chart for sector allocation."""
    if not sector_data:
        # Placeholder bar when the source had no sector breakdown.
        sector_data = {"Data Not Available": 100}

    # Largest allocation first; (value, name) tuples keep tie-breaking deterministic.
    ranked = sorted(zip(sector_data.values(), sector_data.keys()), reverse=True)
    values, sectors = zip(*ranked)

    fig, ax = plt.subplots(figsize=(5, max(3, len(sectors) * 0.35)))
    bars = ax.barh(sectors, values, color=BRAND_ACCENT, edgecolor='white', height=0.6)

    # Annotate each bar with its percentage just past the bar end.
    for rect, pct in zip(bars, values):
        ax.text(rect.get_width() + 0.3, rect.get_y() + rect.get_height() / 2,
                f'{pct:.1f}%', va='center', fontsize=7, color='black')

    ax.set_xlabel("Allocation (%)", fontsize=8, color='gray')
    ax.set_title(title, fontsize=10, fontweight='bold', color=BRAND_BLUE)
    ax.set_xlim(0, max(values) * 1.2)
    ax.invert_yaxis()
    ax.spines[['top', 'right']].set_visible(False)
    ax.tick_params(axis='y', labelsize=8)
    fig.tight_layout()
    return _buf(fig)
92
+
93
+
94
def market_cap_pie(market_cap_data: Dict[str, float]) -> io.BytesIO:
    """Pie chart for Large/Mid/Small/Other market cap split.

    Buckets with a non-positive share are dropped. If nothing remains
    (all zeros or missing data), a neutral "No Data" wedge is drawn
    instead of attempting an empty pie, which would fail/render blank.
    """
    default = {"Large Cap": 0, "Mid Cap": 0, "Small Cap": 0, "Others": 0}
    data = {**default, **market_cap_data}
    data = {k: v for k, v in data.items() if v > 0}
    if not data:
        # Robustness fix: every bucket was zero/absent — show a placeholder.
        data = {"No Data": 100}

    colors = {"Large Cap": "#2E75B6", "Mid Cap": "#E67E22",
              "Small Cap": "#2ECC71", "Others": "#BDC3C7"}

    labels = list(data.keys())
    values = list(data.values())
    # Unknown labels (e.g. the placeholder) fall back to a neutral grey.
    clrs = [colors.get(l, "#95A5A6") for l in labels]

    fig, ax = plt.subplots(figsize=(4, 3.5))
    wedges, _, autotexts = ax.pie(
        values, labels=None, autopct='%1.1f%%',
        colors=clrs, startangle=90, pctdistance=0.75
    )
    for at in autotexts:
        at.set_fontsize(8)
        at.set_color("white")

    ax.legend(wedges, labels, loc="lower center", bbox_to_anchor=(0.5, -0.12),
              ncol=2, fontsize=8, frameon=False)
    ax.set_title("Market Cap Allocation", fontsize=10, fontweight='bold', color=BRAND_BLUE)
    fig.tight_layout()
    return _buf(fig)
121
+
122
+
123
def holding_vs_benchmark_chart(
    fund_name: str,
    cagr_data: Dict[str, Dict[str, Optional[float]]],
) -> io.BytesIO:
    """
    Bar chart comparing fund CAGR vs benchmark across time periods.
    cagr_data = {
        '1Y': {'fund': 12.5, 'benchmark': 14.6, 'category': 13.4},
        '3Y': {...}, '5Y': {...}, '10Y': {...}
    }
    """
    periods = list(cagr_data)

    def _series(key: str) -> list:
        # Missing values (None) render as zero-height bars.
        return [cagr_data[p].get(key) or 0 for p in periods]

    fund_vals = _series('fund')
    bm_vals = _series('benchmark')
    cat_vals = _series('category')

    x = np.arange(len(periods))
    width = 0.25

    fig, ax = plt.subplots(figsize=(5, 3.5))
    bar_groups = [
        ax.bar(x - width, fund_vals, width, label='Fund', color=BRAND_ACCENT, zorder=2),
        ax.bar(x, bm_vals, width, label='Benchmark', color='#E67E22', zorder=2),
        ax.bar(x + width, cat_vals, width, label='Category', color='#BDC3C7', zorder=2),
    ]

    # Value labels above each non-zero bar.
    for bars in bar_groups:
        for rect in bars:
            height = rect.get_height()
            if height:
                ax.text(rect.get_x() + rect.get_width() / 2, height + 0.2,
                        f'{height:.1f}', ha='center', va='bottom', fontsize=6.5)

    ax.set_xticks(x)
    ax.set_xticklabels(periods, fontsize=9)
    ax.set_ylabel("CAGR (%)", fontsize=8, color='gray')
    ax.set_title(f"{fund_name[:30]}\nReturns vs Benchmark", fontsize=9, fontweight='bold', color=BRAND_BLUE)
    ax.legend(fontsize=7, frameon=False)
    ax.spines[['top', 'right']].set_visible(False)
    ax.yaxis.grid(True, linestyle='--', alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    fig.tight_layout()
    return _buf(fig)
166
+
167
+
168
def quartile_analysis_grid(holdings_data: list) -> io.BytesIO:
    """
    Quartile Analysis Grid — based on the senior's handwritten sketch.

    Layout (matching sketch exactly):
      Columns : 1Y | 3Y | 5Y | 10Y
      For each holding, show 3 rows:
        BM    : Benchmark CAGR value for each period
        Cat   : Category Average CAGR for each period
        Scheme: Fund CAGR + Quartile (Q1/Q2/Q3/Q4) — color-coded

    holdings_data: list of dicts, each with keys:
      scheme_name, rank_in_category, total_in_category,
      cagr_1y/_bm/_cat, cagr_3y/_bm/_cat, cagr_5y/_bm/_cat, cagr_10y/_bm/_cat
    """
    PERIODS = ["1Y", "3Y", "5Y", "10Y"]
    PERIOD_KEYS = ["1y", "3y", "5y", "10y"]
    ROW_LABELS = ["BM", "Cat", "Scheme"]

    # Quartile fill colours (Q1 best → Q4 worst) plus fixed tints for BM/Cat rows.
    Q_COLORS = {1: "#90EE90", 2: "#BDD7EE", 3: "#FFD580", 4: "#FFB3B3"}
    HEADER_CLR = "#1F3864"
    BM_CLR = "#D6E4F0"
    CAT_CLR = "#EBF5FB"

    def get_quartile(rank, total):
        # Missing rank data is treated as bottom quartile.
        if not rank or not total or total == 0:
            return 4
        pct = rank / total
        if pct <= 0.25: return 1
        if pct <= 0.50: return 2
        if pct <= 0.75: return 3
        return 4

    def fmt(v):
        # Format a CAGR value as "x.x%", en-dash for missing.
        # NOTE(review): bare except silently maps any non-numeric value to the
        # dash — consider narrowing to (TypeError, ValueError).
        if v is None: return "–"
        try: return f"{float(v):.1f}%"
        except: return "–"

    n_holdings = len(holdings_data)
    rows_per = 3  # BM, Cat, Scheme
    n_rows = n_holdings * rows_per + 1  # +1 for header row
    n_cols = 5  # Label + 4 periods

    fig_h = max(4.5, 0.5 * n_rows + 1.5)
    fig, ax = plt.subplots(figsize=(10, fig_h))
    ax.set_xlim(0, n_cols)
    ax.set_ylim(0, n_rows)
    ax.axis('off')

    def cell(row, col, text, bg, tc="#1F3864", bold=False, fs=8):
        # Draw one table cell: filled unit rectangle + centred text.
        # Row 0 is the top of the table, so y is flipped (n_rows - row - 1).
        ax.add_patch(plt.Rectangle(
            (col, n_rows - row - 1), 1, 1,
            facecolor=bg, edgecolor="#AAAAAA", linewidth=0.5, zorder=1))
        ax.text(col + 0.5, n_rows - row - 0.5, text,
                ha='center', va='center', fontsize=fs,
                fontweight='bold' if bold else 'normal',
                color=tc, zorder=2, wrap=True)

    # Column header row
    col_widths = [1.5, 1, 1, 1, 0.8]  # proportional, but we draw on a 5-unit grid
    cell(0, 0, "Scheme / Row", HEADER_CLR, "white", bold=True, fs=7.5)
    for ci, p in enumerate(PERIODS, 1):
        cell(0, ci, p, HEADER_CLR, "white", bold=True, fs=10)

    # Data rows: 3 rows (BM / Category / Scheme) per holding.
    cur = 1
    for h in holdings_data:
        rank = h.get("rank_in_category")
        total = h.get("total_in_category")
        q = get_quartile(rank, total)
        qc = Q_COLORS[q]
        q_lbl = f"Q{q}"
        name = str(h.get("scheme_name", ""))[:22]

        for ri, rl in enumerate(ROW_LABELS):
            # First column: row label (scheme name shown only on the BM row).
            if ri == 0:
                lbl = f"{name}\n[BM]"
                bg = BM_CLR
            elif ri == 1:
                lbl = "[Category]"
                bg = CAT_CLR
            else:
                lbl = f"[Scheme — {q_lbl}]"
                bg = qc

            cell(cur + ri, 0, lbl, bg, bold=(ri == 2), fs=6.5)

            # Period columns: BM / Cat / Scheme value for each horizon.
            for ci, pk in enumerate(PERIOD_KEYS, 1):
                if ri == 0:
                    v = fmt(h.get(f"cagr_{pk}_bm"))
                    bg_c = BM_CLR
                elif ri == 1:
                    v = fmt(h.get(f"cagr_{pk}_cat"))
                    bg_c = CAT_CLR
                else:
                    fv = h.get(f"cagr_{pk}")
                    bmv = h.get(f"cagr_{pk}_bm")
                    v = fmt(fv)
                    bg_c = qc
                    # Green tick if fund beats benchmark this period
                    if fv is not None and bmv is not None and float(fv) >= float(bmv):
                        ax.text(ci + 0.88, n_rows - (cur + ri) - 0.18,
                                "✓", fontsize=8, color="#006400", va='center', zorder=3)

                cell(cur + ri, ci, v, bg_c, bold=(ri == 2), fs=8)

        # Divider between schemes
        y = n_rows - (cur + rows_per) - 0.02
        ax.axhline(y=y, xmin=0, xmax=1, color="#555555", linewidth=1.0, zorder=4)
        cur += rows_per

    # Legend: one patch per quartile colour.
    patches = [mpatches.Patch(facecolor=Q_COLORS[i], edgecolor='#AAAAAA',
                              label=f"Q{i} – {['Top Quartile','Above Avg','Below Avg','Bottom Quartile'][i-1]}")
               for i in range(1, 5)]
    ax.legend(handles=patches, loc='lower center',
              bbox_to_anchor=(0.5, -0.09), ncol=4, fontsize=7.5, frameon=False)

    ax.set_title("Quartile Analysis — Scheme vs Benchmark & Category Average",
                 fontsize=10, fontweight='bold', color=HEADER_CLR, pad=10)
    fig.tight_layout()
    return _buf(fig)
290
+
291
+
292
def wealth_projection_chart(projection: Dict[int, float], current_value: float) -> io.BytesIO:
    """Line chart showing projected wealth growth at 12% over years."""
    # Prepend "now" (year 0 at the current value) to the projected series.
    years = [0, *projection.keys()]
    values = [current_value, *projection.values()]

    fig, ax = plt.subplots(figsize=(5, 3))
    ax.plot(years, values, marker='o', color=BRAND_ACCENT, linewidth=2, markersize=6)

    # Label each point in lakhs (₹ x.xL), nudged above the marker.
    for year, amount in zip(years, values):
        ax.annotate(f'₹{amount/1e5:.1f}L', (year, amount),
                    textcoords="offset points", xytext=(0, 8),
                    ha='center', fontsize=7.5, color=BRAND_BLUE)

    ax.fill_between(years, values, alpha=0.15, color=BRAND_ACCENT)
    ax.set_xticks(years)
    ax.set_xticklabels(['Now' if year == 0 else f'{year}Y' for year in years], fontsize=8)
    ax.set_ylabel("Portfolio Value (₹)", fontsize=8, color='gray')
    ax.set_title("Wealth Projection @ 12% p.a.", fontsize=10, fontweight='bold', color=BRAND_BLUE)
    ax.spines[['top', 'right']].set_visible(False)
    ax.yaxis.grid(True, linestyle='--', alpha=0.4)
    ax.set_axisbelow(True)
    fig.tight_layout()
    return _buf(fig)
src/csv_enrichment.py ADDED
@@ -0,0 +1,941 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CSV Enrichment β€” missing-cell filler for mutual fund statistics.
2
+
3
+ Fill pipeline (in order):
4
+ 0. SCHEME CODE RESOLUTION β€” fuzzy-match missing scheme codes via mfapi
5
+ 1. TRIAGE β€” classify every missing cell
6
+ 2. NAV ENGINE β€” compute trailing-3Y metrics from NAV history
7
+ 3. WEB SEARCH (Firecrawl) β€” scrape trusted sites for remaining gaps
8
+ 4. CATEGORY MEDIAN β€” last-resort statistical imputation
9
+
10
+ Fixes vs original:
11
+ β€’ Phase-label typo in log (Phase 4 β†’ Phase 5 for imputation step)
12
+ β€’ Unknown launch date β†’ is_young = False (attempt search, don't silently impute)
13
+ β€’ _normalize_fund_name uses re.sub to handle multi-hyphen sequences
14
+ β€’ scheme code resolution runs BEFORE triage so NAV engine fires for more funds
15
+ β€’ Standard Deviation now included in NAV-computable metrics
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import csv
21
+ import os
22
+ import re
23
+ from dataclasses import dataclass, field
24
+ from datetime import datetime
25
+ from pathlib import Path
26
+ from statistics import median
27
+ from typing import Any
28
+
29
+ import requests
30
+
31
+ from src.nav_metrics_engine import NavEngineCache, compute_nav_metrics_for_scheme
32
+ from src.scheme_resolver import (
33
+ resolve_missing_scheme_codes,
34
+ resolve_scheme_code_for_fund_name,
35
+ )
36
+
37
# Fund names attempted by NAV engine this session (process-wide; guards
# against re-running the expensive NAV computation for the same fund).
_NAV_ATTEMPTED_FUNDS: set[str] = set()


# ── Constants ────────────────────────────────────────────────────────────────

# Cell values (after strip + lower) that count as "missing".
MISSING_TOKENS = {"", "-", "na", "n/a", "n/a*", "nan", "none", "null", "—"}

# CSV columns the enrichment pipeline attempts to fill.
TARGET_COLUMNS = (
    "Alpha",
    "Beta",
    "Standard Deviation",
    "Sharpe Ratio",
    "Volatility",
    "Mean",
    "Sortino Ratio",
    "Up Market Capture\nRatio",
    "Down Market Capture\nRatio",
    "Maximum Drawdown",
    "R-Squared",
    "Information Ratio",
    "P/E Ratio",
    "P/B Ratio",
)

# For all of these risk/ratio metrics, a literal numeric 0 is usually a
# data-quality gap rather than a meaningful "zero risk" value. We therefore
# treat 0 as missing so that enrichment (NAV engine + web search) can attempt
# to backfill real numbers.
ZERO_AS_MISSING_COLUMNS = set(TARGET_COLUMNS)

# ALL metrics that are equity-specific and should NOT be attempted
# via NAV engine or web search for debt/liquid/overnight funds.
# Sharpe, Sortino, Volatility etc. ARE computed by NAV engine for equity
# but for debt funds they either don't exist or are meaningless.
EQUITY_ONLY_METRICS = {
    "Alpha",
    "Beta",
    "Standard Deviation",
    "Sharpe Ratio",
    "Volatility",
    "Mean",
    "Sortino Ratio",
    "Up Market Capture\nRatio",
    "Down Market Capture\nRatio",
    "Maximum Drawdown",
    "R-Squared",
    "Information Ratio",
    "P/E Ratio",
    "P/B Ratio",
}

# Category-name prefixes identifying debt-like funds (matched case-insensitively
# by _is_debt_category).
DEBT_CATEGORIES_PREFIXES = (
    "Debt:", "Liquid", "Overnight", "Money Market", "Gilt",
    "Fixed Maturity", "Interval Fund", "FMP",
)

# Minimum fund age (years) before trailing risk metrics are considered computable.
MIN_YEARS_FOR_RISK_METRICS = 3

# Domains whose scraped content is preferred when ranking search results.
TRUSTED_DOMAINS = (
    "valueresearchonline.com",
    "morningstar.in",
    "moneycontrol.com",
    "advisorkhoj.com",
    "amfiindia.com",
    "tickertape.in",
)

# Per-metric row labels to look for when scanning scraped markdown tables;
# tried in order, first alias that yields a numeric cell wins.
METRIC_ALIASES: dict[str, list[str]] = {
    "Alpha": ["alpha"],
    "Beta": ["beta"],
    "Standard Deviation": ["standard deviation", "std dev", "std. dev"],
    "Sharpe Ratio": ["sharpe ratio", "sharpe"],
    "Volatility": ["volatility"],
    "Mean": ["mean", "mean return"],
    "Sortino Ratio": ["sortino ratio", "sortino"],
    "Up Market Capture\nRatio": ["upside", "up market capture", "upside capture", "up capture"],
    "Down Market Capture\nRatio": ["downside", "down market capture", "downside capture", "down capture"],
    "Maximum Drawdown": ["maximum drawdown", "max drawdown", "maximum"],
    "R-Squared": ["r-squared", "r squared", "r2", "r²"],
    "Information Ratio": ["information ratio", "info ratio"],
    "P/E Ratio": ["p/e ratio", "p/e", "pe ratio", "pe"],
    "P/B Ratio": ["p/b ratio", "p/b", "pb ratio", "pb"],
}
121
+
122
+
123
+ # ── Config & Result ──────────────────────────────────────────────────────────
124
+
125
@dataclass
class EnrichmentConfig:
    """Tunable settings for the enrichment pipeline (see ``enrich_csv``)."""

    enabled: bool = True  # master switch; False makes enrich_csv a no-op
    max_cells: int | None = None  # cap on SEARCHABLE cells processed (None = all)
    min_confidence: float = 0.65  # confidence threshold (usage not shown in this chunk — TODO confirm)
    search_limit: int = 5  # max results requested per web-search query
    impute_unresolved: bool = True  # fall back to category medians when no value found
    filter_category: str | None = None  # restrict triage to a single category
    target_columns: tuple[str, ...] = TARGET_COLUMNS  # columns to fill
    trusted_domains: tuple[str, ...] = TRUSTED_DOMAINS  # preferred scrape domains
    enable_nav_engine: bool = True  # compute metrics from NAV history first
    resolve_scheme_codes: bool = True  # run pre-triage code resolution
    web_search_pe_pb_only: bool = False  # limit web search to P/E and P/B only
139
+
140
@dataclass
class EnrichmentResult:
    """Summary of one ``enrich_csv`` run: file paths plus per-phase counters."""

    input_csv_path: str
    enriched_csv_path: str
    scratchpad_path: str | None = None
    examined_cells: int = 0  # missing cells found by triage
    updated_cells: int = 0  # cells filled with a real (NAV/web) value
    imputed_cells: int = 0  # cells filled with a category median
    skipped_cells: int = 0  # cells left empty (no value, no median)
    resolved_codes: int = 0  # NEW: how many scheme codes were resolved
    # Optional breakdowns used by older callers / UIs
    nav_cells: int = 0  # cells filled via NAV engine
    web_cells: int = 0  # cells filled via web search
    errors: list[str] = field(default_factory=list)
154
+
155
+
156
+ # ── Triage labels ────────────────────────────────────────────────────────────
157
+
158
# Triage classification labels (assigned by _triage_missing_cells).
TRIAGE_YOUNG = "YOUNG_FUND"  # fund has <3y history; metric cannot exist yet
TRIAGE_NOT_APPLICABLE = "NOT_APPLICABLE"  # not produced by _triage_missing_cells in this version
TRIAGE_SEARCHABLE = "SEARCHABLE"  # metric should exist; attempt NAV engine / web search
161
+
162
+
163
+ # ── Helpers ──────────────────────────────────────────────────────────────────
164
+
165
def _load_env() -> None:
    """Populate os.environ from the project-root ``.env`` file, if present.

    Existing environment variables are never overwritten (``setdefault``).
    Blank lines, comments, and lines without ``KEY=VALUE`` are ignored.
    """
    dotenv = Path(__file__).resolve().parent.parent / ".env"
    if not dotenv.exists():
        return
    for entry in dotenv.read_text(encoding="utf-8").splitlines():
        stripped = entry.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        key, _, value = stripped.partition("=")
        os.environ.setdefault(key.strip(), value.strip())
175
+
176
+
177
def _is_missing(val: str | None) -> bool:
    """True when *val* (None-safe) normalizes to a known missing-token."""
    normalized = "" if val is None else val
    return normalized.strip().lower() in MISSING_TOKENS
179
+
180
+
181
+ def _parse_launch_date(val: str | None) -> datetime | None:
182
+ if not val:
183
+ return None
184
+ for fmt in ("%d-%m-%Y", "%Y-%m-%d", "%d/%m/%Y"):
185
+ try:
186
+ return datetime.strptime(val.strip(), fmt)
187
+ except ValueError:
188
+ continue
189
+ return None
190
+
191
+
192
def _is_debt_category(category: str) -> bool:
    """True when *category* begins with any debt-like prefix (case-insensitive)."""
    cat = (category or "").strip().lower()
    return any(cat.startswith(prefix.lower()) for prefix in DEBT_CATEGORIES_PREFIXES)
198
+
199
+
200
+ def _normalize_fund_name(name: str) -> str:
201
+ # FIX: use re.sub so multi-hyphen runs collapse to a single space
202
+ return re.sub(r"-+", " ", name).strip()
203
+
204
+
205
def _build_category_medians(
    rows: list[dict[str, str]], columns: tuple[str, ...]
) -> dict[str, dict[str, float]]:
    """Compute per-category medians for each target column.

    Cell text is cleaned of '%' and thousands separators; missing tokens and
    non-numeric values are ignored. Returns ``{category: {column: median}}``
    with medians rounded to 4 decimals; a column with no numeric samples is
    omitted from its category's map, and rows without a Category are skipped.
    """
    samples: dict[str, dict[str, list[float]]] = {}
    for record in rows:
        category = record.get("Category", "")
        if not category:
            continue
        per_col = samples.setdefault(category, {c: [] for c in columns})
        for column in columns:
            cleaned = (record.get(column) or "").strip().replace("%", "").replace(",", "")
            if cleaned.lower() in MISSING_TOKENS:
                continue
            try:
                numeric = float(cleaned)
            except ValueError:
                continue
            per_col[column].append(numeric)

    return {
        category: {
            column: round(median(values), 4)
            for column, values in per_col.items()
            if values
        }
        for category, per_col in samples.items()
    }
232
+
233
+
234
+ # ── Triage ───────────────────────────────────────────────────────────────────
235
+
236
@dataclass
class TriagedCell:
    """One missing CSV cell together with the triage decision made for it."""

    row_idx: int  # index into the parsed rows list
    fund_name: str
    category: str
    column: str  # the target column that is missing
    current_value: str  # raw cell content at triage time ("", "-", "0", …)
    label: str  # one of the TRIAGE_* constants
    reason: str  # human-readable explanation, written to the scratchpad
245
+
246
+
247
def _triage_missing_cells(
    rows: list[dict[str, str]],
    config: EnrichmentConfig,
) -> list[TriagedCell]:
    """Classify every missing cell with reasoning.

    A cell is "missing" when its value is in MISSING_TOKENS or — for
    ZERO_AS_MISSING_COLUMNS — parses to exactly 0. Each missing cell is
    labelled TRIAGE_YOUNG (fund too young for the metric to exist yet) or
    TRIAGE_SEARCHABLE (metric should exist; try NAV engine / web search).
    """
    today = datetime.now()
    cells: list[TriagedCell] = []

    for idx, row in enumerate(rows):
        fund = row.get("Fund", "")
        cat = row.get("Category", "")
        launch_str = row.get("Launch Date", "")

        # Optional category filter: skip rows outside the requested category.
        if config.filter_category and cat != config.filter_category:
            continue

        launch_dt = _parse_launch_date(launch_str)

        if launch_dt is not None:
            age_years = (today - launch_dt).days / 365.25
            is_young = age_years < MIN_YEARS_FOR_RISK_METRICS
        else:
            # FIX: unknown date → do NOT silently mark as young; attempt search
            is_young = False

        # NOTE(review): computed but not used below in this function —
        # confirm downstream use before removing.
        is_debt = _is_debt_category(cat)

        for col in config.target_columns:
            raw = (row.get(col) or "").strip()

            # Base missing check (blank, "-", "N/A", etc.)
            is_missing_val = _is_missing(raw)

            # Additionally, for all ZERO_AS_MISSING_COLUMNS, treat an exact
            # numeric 0 as "missing" so enrichment will try to fill it.
            if not is_missing_val and col in ZERO_AS_MISSING_COLUMNS:
                norm = raw.replace("%", "").replace(",", "").strip()
                try:
                    if float(norm) == 0.0:
                        is_missing_val = True
                except ValueError:
                    pass

            if not is_missing_val:
                continue

            if is_young:
                cells.append(TriagedCell(
                    row_idx=idx, fund_name=fund, category=cat, column=col,
                    current_value=raw, label=TRIAGE_YOUNG,
                    reason=(f"Fund launched {launch_str or '(unknown)'}, "
                            f"<{MIN_YEARS_FOR_RISK_METRICS}yr history — metric not computed yet"),
                ))
            else:
                cells.append(TriagedCell(
                    row_idx=idx, fund_name=fund, category=cat, column=col,
                    current_value=raw, label=TRIAGE_SEARCHABLE,
                    reason=(f"Fund launched {launch_str or '(unknown date)'}, "
                            f"category '{cat}' — metric should exist, attempting NAV/web"),
                ))

    return cells
309
+
310
+
311
+ # ── Markdown table parser ────────────────────────────────────────────────────
312
+
313
def _extract_number(text: str) -> float | None:
    """Pull the first signed decimal number out of *text*, or None.

    Commas are stripped first; missing-token cells yield None.
    """
    cleaned = text.strip().replace(",", "")
    if cleaned.lower() in MISSING_TOKENS or cleaned == "—":
        return None
    hit = re.search(r"-?\d+\.?\d*", cleaned)
    if hit is None:
        return None
    try:
        return float(hit.group())
    except ValueError:
        return None
324
+
325
+
326
def _parse_table_row(markdown: str, alias: str) -> float | None:
    """Extract the first numeric cell after *alias* in a markdown table row.

    For a row like ``| Alpha | 1.59 | -0.56 | 8.25 |`` this returns 1.59 —
    the leftmost numeric value after the label. That is intentional: sites
    like Morningstar show Fund | Category | Index and we want the fund
    value, not the category or index value.
    """
    row_re = re.compile(
        r"\|\s*" + re.escape(alias) + r"\s*\|(.+?)(?:\n|$)",
        re.IGNORECASE,
    )
    for hit in row_re.finditer(markdown):
        for piece in hit.group(1).split("|"):
            number = _extract_number(piece.strip())
            if number is not None:
                return number
    return None
346
+
347
+
348
def _parse_metrics_from_markdown(
    markdown: str, wanted_metrics: list[str]
) -> dict[str, float | None]:
    """Scan scraped markdown for each wanted metric via its known aliases.

    Returns ``{metric: value-or-None}``. For each metric, the first alias
    that both appears in the text and yields a numeric table cell wins.
    """
    # PERF: hoist the lowercase conversion out of the alias loop —
    # *markdown* is the concatenation of several scraped pages and was
    # previously re-lowered once per alias per metric.
    markdown_lower = markdown.lower()
    found: dict[str, float | None] = {}
    for metric in wanted_metrics:
        aliases = METRIC_ALIASES.get(metric, [metric.lower()])
        best_val: float | None = None
        for alias in aliases:
            if alias.lower() not in markdown_lower:
                continue
            val = _parse_table_row(markdown, alias)
            if val is not None:
                best_val = val
                break
        found[metric] = best_val
    return found
364
+
365
+
366
+ # ── Web search (Firecrawl) ───────────────────────────────────────────────────
367
+
368
def _call_tavily_search(query: str, api_key: str, limit: int = 5) -> list[dict]:
    """Search using Tavily API. Returns list of dicts with 'url' and 'markdown' keys."""
    payload = {
        "api_key": api_key,
        "query": query,
        "max_results": limit,
        "include_raw_content": True,
        "search_depth": "advanced",
    }
    try:
        response = requests.post(
            "https://api.tavily.com/search",
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=30,
        )
        response.raise_for_status()
        body = response.json()
        # Prefer the full page text; fall back to the snippet content.
        return [
            {
                "url": item.get("url", ""),
                "markdown": item.get("raw_content") or item.get("content", ""),
            }
            for item in body.get("results", [])
        ]
    except Exception as exc:
        print(f" [tavily] search error: {exc}")
        return []


# Keep firecrawl as alias name so _search_fund_metrics calls work unchanged
_call_firecrawl_search = _call_tavily_search
399
+
400
+
401
def _scrape_url(url: str, api_key: str) -> str:
    """Fetch page content using Tavily extract API.

    Returns the extracted raw content, or "" on any failure or empty result.
    """
    try:
        response = requests.post(
            "https://api.tavily.com/extract",
            headers={"Content-Type": "application/json"},
            json={"api_key": api_key, "urls": [url]},
            timeout=30,
        )
        response.raise_for_status()
        extracted = response.json().get("results", [])
        if extracted:
            return extracted[0].get("raw_content", "")
    except Exception as exc:
        print(f" [tavily extract] error for {url}: {exc}")
    return ""
418
+
419
+
420
+ def _derive_morningstar_risk_url(any_ms_url: str) -> str | None:
421
+ if "morningstar.in/mutualfunds/" not in any_ms_url:
422
+ return None
423
+ for suffix in ("fund-factsheet.aspx", "overview.aspx", "portfolio.aspx",
424
+ "performance.aspx", "detailed-portfolio.aspx"):
425
+ if suffix in any_ms_url:
426
+ return any_ms_url.replace(suffix, "risk-ratings.aspx")
427
+ if "risk-ratings.aspx" in any_ms_url:
428
+ return any_ms_url
429
+ return None
430
+
431
+
432
+ def _derive_morningstar_portfolio_url(any_ms_url: str) -> str | None:
433
+ """Derive the Morningstar portfolio page (for P/E and P/B)."""
434
+ if "morningstar.in/mutualfunds/" not in any_ms_url:
435
+ return None
436
+ return re.sub(
437
+ r"(fund-factsheet|overview|risk-ratings|performance|detailed-portfolio)\.aspx",
438
+ "portfolio.aspx",
439
+ any_ms_url,
440
+ )
441
+
442
+
443
def _search_fund_metrics(
    fund_name: str,
    missing_metrics: list[str],
    config: EnrichmentConfig,
    firecrawl_key: str,
) -> tuple[dict[str, float | None], list[str]]:
    """Search trusted finance sites for a fund's missing metrics.

    Returns ``({metric: value-or-None}, source_urls)``. Strategy: one broad
    search, trusted domains ranked first; optionally scrape the Morningstar
    risk-ratings and portfolio pages; if no markdown was collected, retry
    with a ValueResearch-focused query before giving up.
    """
    from urllib.parse import urlparse

    readable = _normalize_fund_name(fund_name)
    query = f"{readable} risk rating alpha beta sharpe morningstar"
    print(f" [search] query: {query[:80]}")

    results = _call_firecrawl_search(query, firecrawl_key, config.search_limit)
    if not results:
        print(" [search] no results")
        return {m: None for m in missing_metrics}, []

    # Partition results into trusted-domain hits and the rest; use the top 3
    # with trusted sources first.
    trusted, other = [], []
    for r in results:
        url = r.get("url", "")
        domain = urlparse(url).netloc.lower().replace("www.", "")
        (trusted if any(td in domain for td in config.trusted_domains) else other).append(r)
    use = (trusted + other)[:3]

    source_urls = [r.get("url", "") for r in use]
    print(f" [search] using {len(use)} sources: {[urlparse(u).netloc for u in source_urls]}")

    # Concatenate all markdown; table parsing later runs over the combined text.
    combined = ""
    for r in use:
        md = r.get("markdown", "")
        if md:
            combined += f"\n\n--- {r.get('url', '')} ---\n{md}"

    # Morningstar: scrape risk-ratings page if not already in results
    ms_risk_url = None
    for r in use:
        ms_risk_url = _derive_morningstar_risk_url(r.get("url", ""))
        if ms_risk_url:
            break
    if ms_risk_url and "risk-ratings" not in " ".join(source_urls):
        print(f" [scrape] Morningstar risk page: {ms_risk_url}")
        risk_md = _scrape_url(ms_risk_url, firecrawl_key)
        if risk_md:
            combined += f"\n\n--- {ms_risk_url} ---\n{risk_md}"
            source_urls.append(ms_risk_url)

    # Morningstar: scrape portfolio page for P/E and P/B
    pe_pb_needed = {"P/E Ratio", "P/B Ratio"} & set(missing_metrics)
    if pe_pb_needed and ms_risk_url:
        ms_port_url = _derive_morningstar_portfolio_url(ms_risk_url)
        if ms_port_url and ms_port_url not in source_urls:
            print(f" [scrape] Morningstar portfolio page: {ms_port_url}")
            port_md = _scrape_url(ms_port_url, firecrawl_key)
            if port_md:
                combined += f"\n\n--- {ms_port_url} ---\n{port_md}"
                source_urls.append(ms_port_url)

    # If we still have no markdown content, or if later we still miss
    # metrics, we'll do a second pass focused on ValueResearch.
    if not combined.strip():
        print(" [search] no markdown from initial sources; retrying via valueresearchonline…")
        vr_query = f"{readable} {' '.join(missing_metrics)} valueresearchonline"
        vr_results = _call_firecrawl_search(vr_query, firecrawl_key, config.search_limit)
        if vr_results:
            vr_combined = ""
            for r in vr_results:
                url = r.get("url", "")
                domain = urlparse(url).netloc.lower().replace("www.", "")
                # Only ValueResearch pages are accepted on the retry pass.
                if "valueresearchonline.com" not in domain:
                    continue
                md = r.get("markdown", "")
                if md:
                    vr_combined += f"\n\n--- {url} ---\n{md}"
                    source_urls.append(url)
            combined = vr_combined

    if not combined.strip():
        print(" [search] no markdown content after ValueResearch retry")
        return {m: None for m in missing_metrics}, source_urls

    found = _parse_metrics_from_markdown(combined, missing_metrics)
    for m, v in found.items():
        print(f" [parsed] {m} = {v if v is not None else 'NOT FOUND'}")

    return found, source_urls
528
+
529
+
530
+ # ── Scratchpad ───────────────────────────────────────────────────────────────
531
+
532
def _write_scratchpad(
    path: Path,
    triaged: list[TriagedCell],
    resolved_codes: dict[str, str],
    nav_results: dict[str, dict[str, float | None]],
    web_results: dict[str, dict[str, float | None]],
    web_sources: dict[str, list[str]],
    medians_used: list[tuple[str, str, float]],
    nav_filled: list[tuple[str, str, float]],
    web_filled: list[tuple[str, str, float]],
) -> None:
    """Write a human-readable audit log of the enrichment run to *path*.

    Sections: resolved scheme codes, triage summary and per-cell decisions,
    raw NAV-engine and web-search results, and the three fill lists
    (NAV-filled, web-filled, median-imputed). Purely informational — the
    scratchpad is never read back by the pipeline.
    """
    lines = [
        "=" * 70,
        "ENRICHMENT SCRATCHPAD",
        f"Generated: {datetime.now().isoformat()}",
        "=" * 70, "",
    ]

    if resolved_codes:
        lines += ["-" * 70, f"SCHEME CODES RESOLVED ({len(resolved_codes)})", "-" * 70]
        for fund, code in resolved_codes.items():
            lines.append(f" {fund[:60]:60s} → {code}")
        lines.append("")

    # Bucket triaged cells by label for the summary counts.
    young = [c for c in triaged if c.label == TRIAGE_YOUNG]
    na = [c for c in triaged if c.label == TRIAGE_NOT_APPLICABLE]
    searchable = [c for c in triaged if c.label == TRIAGE_SEARCHABLE]

    lines += [
        f"TOTAL MISSING CELLS: {len(triaged)}",
        f" YOUNG_FUND (auto-impute): {len(young)}",
        f" NOT_APPLICABLE (auto-impute): {len(na)}",
        f" SEARCHABLE (nav/web): {len(searchable)}",
        "",
        "-" * 70, "TRIAGE DECISIONS", "-" * 70,
    ]
    for c in triaged:
        lines.append(f" [{c.label:16s}] {c.fund_name} :: {c.column}")
        lines.append(f" Reason: {c.reason}")
    lines.append("")

    if nav_results:
        lines += ["-" * 70, "NAV ENGINE RESULTS (TRAILING 3Y)", "-" * 70]
        for fund_key, metrics in nav_results.items():
            lines.append(f" Fund: {fund_key}")
            for metric, val in metrics.items():
                lines.append(f" {metric}: {'FOUND = ' + str(val) if val is not None else 'NOT_FOUND'}")
        lines.append("")

    if web_results:
        lines += ["-" * 70, "WEB SEARCH RESULTS", "-" * 70]
        for fund_key, metrics in web_results.items():
            lines.append(f" Fund: {fund_key}")
            for s in web_sources.get(fund_key, []):
                lines.append(f" Source: {s}")
            for metric, val in metrics.items():
                lines.append(f" {metric}: {'FOUND = ' + str(val) if val is not None else 'NOT_FOUND'}")
        lines.append("")

    # The three fill lists share one rendering loop.
    for section_label, items in [
        (f"NAV-FILLED VALUES ({len(nav_filled)})", nav_filled),
        (f"WEB-FILLED VALUES ({len(web_filled)})", web_filled),
        (f"CATEGORY-MEDIAN IMPUTED ({len(medians_used)})", medians_used),
    ]:
        if items:
            lines += ["-" * 70, section_label, "-" * 70]
            for fund, col, val in items:
                lines.append(f" {fund} :: {col} = {val}")
            lines.append("")

    lines += ["=" * 70, "END OF SCRATCHPAD", "=" * 70]
    path.write_text("\n".join(lines), encoding="utf-8")
604
+
605
+
606
+ # ── Main entry point ─────────────────────────────────────────────────────────
607
+
608
+ def enrich_csv(
609
+ csv_path: str,
610
+ config: EnrichmentConfig | None = None,
611
+ ) -> EnrichmentResult:
612
+ """Parse CSV β†’ resolve codes β†’ triage β†’ NAV engine β†’ web fallback β†’ median impute β†’ write.
613
+
614
+ (Previously named enrich_csv_with_firecrawl_and_kimi; renamed for clarity.)
615
+ """
616
+ if config is None:
617
+ config = EnrichmentConfig()
618
+
619
+ _load_env()
620
+
621
+ src = Path(csv_path)
622
+ result = EnrichmentResult(input_csv_path=csv_path, enriched_csv_path=csv_path)
623
+
624
+ if not config.enabled or not src.exists():
625
+ return result
626
+
627
+ with open(src, encoding="utf-8-sig", newline="") as f:
628
+ reader = csv.DictReader(f)
629
+ fieldnames = list(reader.fieldnames or [])
630
+ rows = list(reader)
631
+
632
+ if not rows:
633
+ return result
634
+
635
+ # ── Phase 0: Scheme Code Resolution ─────────────────────────────────
636
+ resolved_codes: dict[str, str] = {}
637
+ if config.resolve_scheme_codes:
638
+ print("[enrichment] Phase 0: Resolving missing scheme codes…")
639
+ rows, resolved_codes = resolve_missing_scheme_codes(rows, verbose=True)
640
+ result.resolved_codes = len(resolved_codes)
641
+
642
+ # ── Phase 1: Triage ──────────────────────────────────────────────────
643
+ print("[enrichment] Phase 1: Triage β€” classifying missing cells…")
644
+ triaged = _triage_missing_cells(rows, config)
645
+ result.examined_cells = len(triaged)
646
+
647
+ if not triaged:
648
+ print("[enrichment] No missing cells found.")
649
+ _write_output(src, rows, fieldnames, result)
650
+ return result
651
+
652
+ searchable = [c for c in triaged if c.label == TRIAGE_SEARCHABLE]
653
+ imputable = [c for c in triaged if c.label != TRIAGE_SEARCHABLE]
654
+ print(f"[enrichment] {len(triaged)} missing cells: "
655
+ f"{len(searchable)} SEARCHABLE, {len(imputable)} auto-impute")
656
+
657
+ if config.max_cells is not None:
658
+ searchable = searchable[:config.max_cells]
659
+
660
+ # ── Phase 2: Category medians ────────────────────────────────────────
661
+ print("[enrichment] Phase 2: Computing category medians…")
662
+ cat_medians = _build_category_medians(rows, config.target_columns)
663
+
664
+ # ── Phase 3: NAV engine ──────────────────────────────────────────────
665
+ nav_results: dict[str, dict[str, float | None]] = {}
666
+ nav_filled: list[tuple[str, str, float]] = []
667
+
668
+ if searchable and config.enable_nav_engine:
669
+ print("[enrichment] Phase 3: NAV engine β€” computing trailing 3Y metrics…")
670
+ nav_cache = NavEngineCache()
671
+
672
+ # All funds with missing cells go through NAV engine β€” including debt/liquid.
673
+ # Debt funds can have valid Sharpe, Mean, Volatility etc. from their NAV history.
674
+ searchable_for_nav = searchable
675
+
676
+ row_groups: dict[int, list[TriagedCell]] = {}
677
+ for cell in searchable_for_nav:
678
+ row_groups.setdefault(cell.row_idx, []).append(cell)
679
+
680
+ total_rows = len(row_groups)
681
+ processed_count = 0
682
+ nav_lock = __import__("threading").Lock()
683
+
684
+ NAV_WORKERS = 20 # mfapi is stateless REST β€” scales well beyond 12
685
+
686
+ # ── Pre-warm: bulk load NAV + benchmarks before workers touch network ──
687
+ # Step 1: Pull all valid scheme codes and unique benchmarks from rows
688
+ from src.nav_metrics_engine import _bulk_preload_cache, _prewarm_benchmarks
689
+ _scheme_codes = [
690
+ (rows[ri].get("Scheme Code") or "").strip()
691
+ for ri in row_groups
692
+ if (rows[ri].get("Scheme Code") or "").strip().isdigit()
693
+ ]
694
+ _bench_tickers_raw = [
695
+ rows[ri].get("Benchmark Type", "") for ri in row_groups
696
+ ]
697
+ # Step 2: Resolve benchmark type β†’ ticker (same logic as nav engine)
698
+ from src.nav_metrics_engine import resolve_benchmark_ticker
699
+ _bench_tickers = list(dict.fromkeys(
700
+ resolve_benchmark_ticker(b) for b in _bench_tickers_raw if b
701
+ ))
702
+ # Step 3: Bulk load from Neon in 1 SQL query (nav + bench keys)
703
+ _bulk_preload_cache(_scheme_codes, _bench_tickers)
704
+ # Step 4: Download any cold benchmark tickers in parallel NOW,
705
+ # before workers start β€” eliminates yfinance contention
706
+ _prewarm_benchmarks(_bench_tickers)
707
+
708
+ def _process_one_fund(args):
709
+ row_idx, cells = args
710
+ row = rows[row_idx]
711
+ fund_name = row.get("Fund", "")
712
+ scheme_code = (row.get("Scheme Code") or "").strip()
713
+ benchmark_type = row.get("Benchmark Type", "")
714
+ needed_metrics = [c.column for c in cells]
715
+
716
+ if not scheme_code:
717
+ return fund_name, {}, cells
718
+
719
+ metrics, skip = compute_nav_metrics_for_scheme(
720
+ scheme_code=scheme_code,
721
+ benchmark_type=benchmark_type,
722
+ needed_metrics=needed_metrics,
723
+ cache=nav_cache,
724
+ )
725
+
726
+ joined_reasons = " | ".join(skip.values()).lower()
727
+ should_refresh_code = (
728
+ "returned no nav history" in joined_reasons
729
+ or "nav history is stale" in joined_reasons
730
+ )
731
+ if should_refresh_code:
732
+ refreshed_code, _ = resolve_scheme_code_for_fund_name(fund_name)
733
+ if refreshed_code and refreshed_code != scheme_code:
734
+ row["Scheme Code"] = refreshed_code
735
+ metrics, skip = compute_nav_metrics_for_scheme(
736
+ scheme_code=refreshed_code,
737
+ benchmark_type=benchmark_type,
738
+ needed_metrics=needed_metrics,
739
+ cache=nav_cache,
740
+ )
741
+
742
+ return fund_name, metrics, cells
743
+
744
+ from concurrent.futures import ThreadPoolExecutor, as_completed
745
+
746
+ work_items = list(row_groups.items())
747
+ with ThreadPoolExecutor(max_workers=NAV_WORKERS) as executor:
748
+ futures = {executor.submit(_process_one_fund, item): item for item in work_items}
749
+ for fut in as_completed(futures):
750
+ fund_name, metrics, cells = fut.result()
751
+ with nav_lock:
752
+ processed_count += 1
753
+ nav_results[fund_name] = metrics
754
+ for cell in cells:
755
+ val = metrics.get(cell.column)
756
+ if val is not None:
757
+ rows[cell.row_idx][cell.column] = str(round(float(val), 4))
758
+ result.updated_cells += 1
759
+ nav_filled.append((fund_name, cell.column, float(val)))
760
+ # Only mark as attempted if MDD was actually filled β€”
761
+ # drawdown_zero_fix should still retry funds where MDD came back None
762
+ if metrics.get("Maximum Drawdown") is not None:
763
+ _NAV_ATTEMPTED_FUNDS.add(fund_name)
764
+ if processed_count % 20 == 0 or processed_count == total_rows:
765
+ print(f" [nav] {processed_count}/{total_rows} funds processed…")
766
+
767
+ # Keep only still-missing searchable cells for web phase
768
+ searchable = [c for c in searchable if _is_missing(rows[c.row_idx].get(c.column, ""))]
769
+ print(f"[enrichment] NAV phase resolved {len(nav_filled)} cells; "
770
+ f"{len(searchable)} remain for web search")
771
+ result.nav_cells = len(nav_filled)
772
+
773
+ # ── Phase 4: Web search ──────────────────────────────────────────────
774
+ web_results: dict[str, dict[str, float | None]] = {}
775
+ web_sources: dict[str, list[str]] = {}
776
+ web_filled: list[tuple[str, str, float]] = []
777
+
778
+ firecrawl_key = os.environ.get("TAVILY_API_KEY", "")
779
+
780
+ if searchable and firecrawl_key:
781
+ fund_groups: dict[str, list[TriagedCell]] = {}
782
+ for cell in searchable:
783
+ fund_groups.setdefault(cell.fund_name, []).append(cell)
784
+
785
+ print(f"[enrichment] Phase 4: Web search β€” {len(searchable)} cells "
786
+ f"across {len(fund_groups)} funds")
787
+
788
+ # ── Pre-impute non-PE/PB cells if pe_pb_only mode ────────────────
789
+ # Do this before the parallel search so workers only handle PE/PB
790
+ web_search_groups: dict[str, list[TriagedCell]] = {}
791
+ for fund_name, cells in fund_groups.items():
792
+ if config.web_search_pe_pb_only:
793
+ cells_to_impute = [c for c in cells if c.column not in ("P/E Ratio", "P/B Ratio")]
794
+ for cell in cells_to_impute:
795
+ med = cat_medians.get(cell.category, {}).get(cell.column)
796
+ if med is not None and config.impute_unresolved:
797
+ rows[cell.row_idx][cell.column] = str(med)
798
+ result.updated_cells += 1
799
+ result.imputed_cells += 1
800
+ cells_for_web = [c for c in cells if c.column in ("P/E Ratio", "P/B Ratio")]
801
+ else:
802
+ cells_for_web = cells
803
+ if cells_for_web:
804
+ web_search_groups[fund_name] = cells_for_web
805
+
806
+ WEB_WORKERS = 10 # Tavily allows concurrent requests; stay conservative
807
+ web_lock = __import__("threading").Lock()
808
+ web_done = [0]
809
+ total_web = len(web_search_groups)
810
+
811
+ def _search_one_fund(args):
812
+ fund_name, cells = args
813
+ missing_metrics = [c.column for c in cells]
814
+ found, sources = _search_fund_metrics(fund_name, missing_metrics, config, firecrawl_key)
815
+ return fund_name, cells, found, sources
816
+
817
+ from concurrent.futures import ThreadPoolExecutor as _WebTPE, as_completed as _web_as_completed
818
+ with _WebTPE(max_workers=WEB_WORKERS) as web_executor:
819
+ futures = {
820
+ web_executor.submit(_search_one_fund, item): item
821
+ for item in web_search_groups.items()
822
+ }
823
+ for fut in _web_as_completed(futures):
824
+ fund_name, cells, found, sources = fut.result()
825
+ with web_lock:
826
+ web_done[0] += 1
827
+ web_results[fund_name] = found
828
+ web_sources[fund_name] = sources
829
+ print(f"\n[{web_done[0]}/{total_web}] {fund_name}")
830
+ for cell in cells:
831
+ val = found.get(cell.column)
832
+ if val is not None:
833
+ rows[cell.row_idx][cell.column] = str(val)
834
+ result.updated_cells += 1
835
+ web_filled.append((fund_name, cell.column, val))
836
+ print(f" -> {cell.column} = {val} (web)")
837
+ else:
838
+ med = cat_medians.get(cell.category, {}).get(cell.column)
839
+ if med is not None and config.impute_unresolved:
840
+ rows[cell.row_idx][cell.column] = str(med)
841
+ result.imputed_cells += 1
842
+ print(f" ~> {cell.column} = {med} (median)")
843
+ else:
844
+ result.skipped_cells += 1
845
+ print(f" x> {cell.column} β€” not found, no median")
846
+
847
+ elif searchable and not firecrawl_key:
848
+ print("[enrichment] WARNING: TAVILY_API_KEY not set β€” skipping web search, using medians")
849
+ result.errors.append("TAVILY_API_KEY not set")
850
+ for cell in searchable:
851
+ med = cat_medians.get(cell.category, {}).get(cell.column)
852
+ if med is not None and config.impute_unresolved:
853
+ rows[cell.row_idx][cell.column] = str(med)
854
+ result.imputed_cells += 1
855
+ else:
856
+ result.skipped_cells += 1
857
+
858
+ # ── Phase 5: Impute non-searchable (YOUNG / NOT_APPLICABLE) cells ────
859
+ # FIX: was incorrectly labelled "Phase 4" in log
860
+ medians_used: list[tuple[str, str, float]] = []
861
+ if imputable and config.impute_unresolved:
862
+ print(f"\n[enrichment] Phase 5: Imputing {len(imputable)} non-searchable cells…")
863
+ for cell in imputable:
864
+ med = cat_medians.get(cell.category, {}).get(cell.column)
865
+ if med is not None:
866
+ rows[cell.row_idx][cell.column] = str(med)
867
+ result.imputed_cells += 1
868
+ medians_used.append((cell.fund_name, cell.column, med))
869
+ else:
870
+ result.skipped_cells += 1
871
+ elif imputable:
872
+ result.skipped_cells += len(imputable)
873
+
874
+ # Record how many cells came from web search
875
+ result.web_cells = len(web_filled)
876
+
877
+ # ── Phase 6: Write enriched CSV ──────────────────────────────────────
878
+ _write_output(src, rows, fieldnames, result)
879
+
880
+ # ── Phase 7: Scratchpad ──────────────────────────────────────────────
881
+ scratch_dir = Path("scratchpad")
882
+ scratch_dir.mkdir(exist_ok=True)
883
+ stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
884
+ scratch_path = scratch_dir / f"enrichment_{stamp}.txt"
885
+ _write_scratchpad(
886
+ scratch_path, triaged, resolved_codes,
887
+ nav_results, web_results, web_sources,
888
+ medians_used, nav_filled, web_filled,
889
+ )
890
+ result.scratchpad_path = str(scratch_path)
891
+
892
+ print(f"\n[enrichment] DONE β€” nav_filled={len(nav_filled)} web_filled={len(web_filled)} "
893
+ f"imputed={result.imputed_cells} skipped={result.skipped_cells}")
894
+ print(f"[enrichment] Enriched CSV : {result.enriched_csv_path}")
895
+ print(f"[enrichment] Scratchpad : {scratch_path}")
896
+ return result
897
+
898
+
899
+ def _write_output(
900
+ src: Path,
901
+ rows: list[dict[str, str]],
902
+ fieldnames: list[str],
903
+ result: EnrichmentResult,
904
+ ) -> None:
905
+ out_dir = src.parent / "enriched"
906
+ out_dir.mkdir(exist_ok=True)
907
+ out_path = out_dir / f"enriched_{src.name}"
908
+ with open(out_path, "w", encoding="utf-8-sig", newline="") as f:
909
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
910
+ writer.writeheader()
911
+ writer.writerows(rows)
912
+ result.enriched_csv_path = str(out_path)
913
+
914
+
915
# Backward-compat alias (old name used in streamlit_app and run_enrichment_pipeline)
# NOTE: retained only so existing imports keep working; new code should call
# ``enrich_csv`` directly.
enrich_csv_with_firecrawl_and_kimi = enrich_csv
917
+
918
+
919
# ── Single metric lookup (for Streamlit UI) ──────────────────────────────────

def lookup_fund_metric_value(
    fund_name: str,
    column_name: str,
    scheme_code: str = "",
    config: EnrichmentConfig | None = None,
) -> dict[str, Any]:
    """Look up a single metric value for one fund via web search.

    Used by the Streamlit UI for on-demand lookups. ``scheme_code`` is
    accepted for interface compatibility but is not used by the search.

    Returns a dict with:
        status  -- "found", "not_found", or "error" (when the API key is missing)
        fund / metric / value / sources -- lookup details (absent on "error")
    """
    _load_env()
    cfg = EnrichmentConfig() if config is None else config

    api_key = os.environ.get("TAVILY_API_KEY", "")
    if not api_key:
        return {"status": "error", "message": "TAVILY_API_KEY not set"}

    found, sources = _search_fund_metrics(fund_name, [column_name], cfg, api_key)
    value = found.get(column_name)
    return {
        "status": "found" if value is not None else "not_found",
        "fund": fund_name,
        "metric": column_name,
        "value": value,
        "sources": sources,
    }
src/data_engine.py ADDED
@@ -0,0 +1,1210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Engine: -d mode
3
+
4
+ Reads the fund-stats CSV and exports processed Excel matching Processed data.xlsx format.
5
+
6
+ Layout (matching target XLSX):
7
+ - One combined sheet with all fund categories
8
+ - Header row (light green #C9FFCC)
9
+ - For each category:
10
+ - Category header row (no fill, bold text)
11
+ - BM Index row (Col A: #BAEAEE, CAGR cols F,G,H,I: #C4EFFF)
12
+ - Category Average row (Col A: #BAEAEE, CAGR cols F,G,H,I + P/E,P/B cols L,M: #C4EFFF)
13
+ - Fund rows sorted by score (weightage) descending, strictly largest to lowest
14
+ - Weightage scoring: Compare fund CAGR vs Category Average (NOT BM Index)
15
+ - 1Y CAGR beats Cat Avg: 2 pts
16
+ - 3Y CAGR beats Cat Avg: 3 pts
17
+ - 5Y CAGR beats Cat Avg: 4 pts
18
+ - 10Y CAGR beats Cat Avg: 5 pts
19
+ - Max possible: 14 pts
20
+ - Yellow background (#F1FFB6) on Col A only if Weightage >= 8
21
+ - NO green/red font coloring on CAGR cells (plain black only)
22
+ - Category Average row Col B is EMPTY (no benchmark type)
23
+ """
24
+
25
+ import csv
26
+ import math
27
+ import re
28
+ from datetime import datetime
29
+ from pathlib import Path
30
+ from typing import List, Optional, Tuple, Dict, Any
31
+
32
+ from openpyxl import Workbook
33
+ from openpyxl.styles import PatternFill, Font, Alignment, Border, Side
34
+ from openpyxl.utils import get_column_letter
35
+ from openpyxl.formatting.rule import Rule, CellIsRule, FormulaRule
36
+ from openpyxl.styles.differential import DifferentialStyle
37
+
38
+ from src.models import Fund
39
+ from src.weightage import compute_scores, drawdown_zero_fix
40
+ from src.reference_data import extract_reference_data, get_fund_weightage_from_reference, DEFAULT_REFERENCE_PATH
41
+
42
+
43
# ─── Color palette ─────────────────────────────────────────────────────────────────
# Solid PatternFills used when writing the processed workbook; hex values
# mirror the target "Processed data.xlsx" styling.
FILL_HEADER = PatternFill(start_color="C9FFCC", end_color="C9FFCC", fill_type="solid")  # light-green column-header row
FILL_BM_ROW = PatternFill(start_color="BAEAEE", end_color="BAEAEE", fill_type="solid")  # BM Index row, column A
FILL_BM_CAGR = PatternFill(start_color="C4EFFF", end_color="C4EFFF", fill_type="solid")  # BM Index CAGR cells
FILL_CAT_AVG = PatternFill(start_color="BAEAEE", end_color="BAEAEE", fill_type="solid")  # Category Average row, column A
FILL_CAT_CAGR = PatternFill(start_color="C4EFFF", end_color="C4EFFF", fill_type="solid")  # Category Average CAGR / P/E / P/B cells
FILL_WEIGHTED_YELLOW = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")  # high-weightage highlight
FILL_WEIGHTED_GREEN = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
FILL_WHITE = PatternFill(fill_type=None)  # NOTE: despite the name this is "no fill" — it clears any background
FILL_WEIGHT_REF = PatternFill(start_color="EDEDED", end_color="EDEDED", fill_type="solid")  # light grey weight row

# Quartile fills (rank-band colouring, best → worst quartile)
FILL_QUARTILE_GREEN = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
FILL_QUARTILE_YELLOW = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
FILL_QUARTILE_ORANGE = PatternFill(start_color="FFC000", end_color="FFC000", fill_type="solid")
FILL_QUARTILE_RED = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

# ── Fonts — Arial for identical rendering on macOS + Windows ─────────────────
# openpyxl falls back gracefully when Arial is absent, but both platforms ship it.
FONT_DEFAULT = Font(name="Arial", size=8, color="000000")
FONT_DEFAULT_BOLD = Font(name="Arial", size=8, bold=True, color="000000")
FONT_HEADER = Font(name="Arial", size=8, bold=True, color="000000")
FONT_CAT_HEADER = Font(name="Arial", size=10, bold=True, color="000000")  # larger face for category title rows
FONT_WEIGHT_REF = Font(name="Arial", size=7, italic=True, color="666666")  # subtle grey label

# Thin light-grey grid border applied cell-by-cell
THIN = Side(border_style="thin", color="CCCCCC")
BORDER_THIN = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)
70
+
71
+
72
# ─── Weight reference row data (advisor-revised March 2026) ──────────────────
# Shown beneath every category's column-header row as a read-only reference.
# Must match src/weightage.py WEIGHTS exactly.
# ↑ = Top-10 (higher better), ↓ = Bottom-10 (lower better)
WEIGHT_REF_ROW: Dict[str, str] = {
    "ter": "0.15 ↓",
    "turnover": "0.10 ↓",
    "cagr_3y": "0.40 ↑",
    "cagr_5y": "0.60 ↑",
    "cagr_10y": "0.75 ↑",
    "pe_ratio": "0.15 ↓",
    "alpha": "1.00 ↑*",  # * = Light Red if α < 1
    "std_dev": "1.00 ↓",
    "sharpe": "1.20 ↑",
    "sortino": "1.30 ↑",
    "down_capture": "1.00 ↓",
    "max_drawdown": "1.35 ↑",
    "info_ratio": "1.00 ↑*",  # * = Light Red if IR < 0
    "weightage": "10.00",  # equals the sum of the thirteen weights above
}
92
+
93
+
94
# ─── Column definitions ───────────────────────────────────────────────────────
# Tuple: (header_label, fund_attr, col_width, is_pct, decimal_places)
# Widths are calibrated so wrap_text = True keeps cells readable without
# the advisor needing to manually drag columns on either platform.
# List order defines workbook columns A..Z (letters noted per entry).
XLSX_COLUMNS = [
    ("Fund", "name", 40, False, 0),  # A — wide: long fund names
    ("Benchmark Type", "benchmark", 22, False, 0),  # B
    ("TER", "ter", 9, True, 4),  # C
    ("Turn over (%)", "turnover", 11, True, 2),  # D
    ("Mean", "mean", 9, False, 2),  # E
    ("1 Year CAGR", "cagr_1y", 10, False, 2),  # F
    ("3 Years CAGR", "cagr_3y", 10, False, 2),  # G
    ("5 Years CAGR", "cagr_5y", 10, False, 2),  # H
    ("10 Years CAGR", "cagr_10y", 11, False, 2),  # I
    ("CAGR Since Inception", "cagr_inception", 14, False, 2),  # J
    ("NAV", "nav", 10, False, 2),  # K
    ("P/E Ratio", "pe_ratio", 10, False, 2),  # L
    ("P/B Ratio", "pb_ratio", 10, False, 2),  # M
    ("Alpha", "alpha", 10, False, 2),  # N
    ("Volatility", "volatility", 10, False, 2),  # O
    ("Beta", "beta", 9, False, 2),  # P
    ("Standard Deviation", "std_dev", 14, False, 2),  # Q
    ("Sharpe Ratio", "sharpe", 11, False, 2),  # R
    ("Sortino Ratio", "sortino", 11, False, 2),  # S
    ("Up Market Capture", "up_capture", 14, False, 2),  # T
    ("Down Market Capture", "down_capture", 16, False, 2),  # U
    ("Maximum Drawdown", "max_drawdown", 15, False, 2),  # V
    ("R-Squared", "r_squared", 11, False, 2),  # W
    ("Information Ratio", "info_ratio", 14, False, 2),  # X
    ("Total Assets (in Cr)", "aum", 16, False, 1),  # Y
    ("Weightage", "weightage", 11, False, 3),  # Z — 3dp for precision
]

# Total number of workbook columns (26 == A..Z)
NUM_COLS = len(XLSX_COLUMNS)
128
+
129
+
130
+ def _to_float(val) -> Optional[float]:
131
+ """Safely convert raw CSV value to float."""
132
+ if val is None:
133
+ return None
134
+ s = str(val).strip().replace('%', '').replace(',', '')
135
+ if s in ('', '-', 'N/A*', 'N/A', 'nan', 'None'):
136
+ return None
137
+ try:
138
+ return float(s)
139
+ except ValueError:
140
+ return None
141
+
142
+
143
+ def _parse_ter(val) -> Optional[float]:
144
+ """Parse TER value - CSV has percentage format like '1.40%', convert to decimal."""
145
+ if val is None:
146
+ return None
147
+ # Check if percentage BEFORE stripping
148
+ is_pct = '%' in str(val)
149
+ s = str(val).strip().replace('%', '').replace(',', '')
150
+ if s in ('', '-', 'N/A*', 'N/A', 'nan', 'None'):
151
+ return None
152
+ try:
153
+ v = float(s)
154
+ # Convert percentage to decimal (e.g., 1.40 -> 0.014)
155
+ if is_pct:
156
+ v = v / 100
157
+ return v
158
+ except ValueError:
159
+ return None
160
+
161
+
162
+ def _parse_turnover(val) -> Optional[float]:
163
+ """Parse turnover value - CSV has percentage format like '20%', convert to decimal."""
164
+ if val is None:
165
+ return None
166
+ # Check if percentage BEFORE stripping
167
+ is_pct = '%' in str(val)
168
+ s = str(val).strip().replace('%', '').replace(',', '')
169
+ if s in ('', '-', 'N/A*', 'N/A', 'nan', 'None'):
170
+ return None
171
+ try:
172
+ v = float(s)
173
+ # Convert percentage to decimal (e.g., 20 -> 0.20)
174
+ if is_pct:
175
+ v = v / 100
176
+ return v
177
+ except ValueError:
178
+ return None
179
+
180
+
181
+ def _parse_launch_date(val) -> Optional[datetime]:
182
+ """Parse launch date from CSV into datetime."""
183
+ if val is None:
184
+ return None
185
+ s = str(val).strip()
186
+ if not s or s in ("-", "N/A", "N/A*"):
187
+ return None
188
+ for fmt in ("%d-%m-%Y", "%Y-%m-%d", "%d/%m/%Y"):
189
+ try:
190
+ return datetime.strptime(s, fmt)
191
+ except ValueError:
192
+ continue
193
+ return None
194
+
195
+
196
+ # ─── Auto-calculation for incomplete sections ────────────────────────────────────
197
+
198
+ def _calculate_category_averages(funds: List[Fund]) -> Dict[str, Dict[str, Any]]:
199
+ """
200
+ Calculate category averages from fund-level category CAGR values.
201
+ For categories without official data, extract category average values from fund rows.
202
+ Uses the FIRST fund's category average value for each period.
203
+ """
204
+ categories: Dict[str, List[Fund]] = {}
205
+
206
+ # Group funds by category
207
+ for fund in funds:
208
+ if fund.category not in categories:
209
+ categories[fund.category] = []
210
+ categories[fund.category].append(fund)
211
+
212
+ cat_avg_data: Dict[str, Dict[str, Any]] = {}
213
+
214
+ for cat_name, cat_funds in categories.items():
215
+ if not cat_funds:
216
+ continue
217
+
218
+ # Use the FIRST fund's category average values
219
+ # This matches the CSV structure where all funds should have the same category average
220
+ first_fund = cat_funds[0]
221
+
222
+ cat_avg_data[cat_name] = {
223
+ 'cagr_1y': first_fund.cagr_1y_cat if first_fund.cagr_1y_cat and first_fund.cagr_1y_cat != 0 else None,
224
+ 'cagr_3y': first_fund.cagr_3y_cat if first_fund.cagr_3y_cat and first_fund.cagr_3y_cat != 0 else None,
225
+ 'cagr_5y': first_fund.cagr_5y_cat if first_fund.cagr_5y_cat and first_fund.cagr_5y_cat != 0 else None,
226
+ 'cagr_10y': first_fund.cagr_10y_cat if first_fund.cagr_10y_cat and first_fund.cagr_10y_cat != 0 else None,
227
+ 'pe_ratio': None,
228
+ 'pb_ratio': None,
229
+ 'is_calculated': True # Flag to indicate this is calculated from fund data
230
+ }
231
+
232
+ return cat_avg_data
233
+
234
+
235
+ def _calculate_benchmark_index(funds: List[Fund]) -> Dict[str, Dict[str, Any]]:
236
+ """
237
+ Calculate BM Index from fund-level benchmark CAGR values.
238
+ For categories without a BM Index row in CSV, extract benchmark values from fund rows.
239
+ Uses the FIRST fund's benchmark value for each period.
240
+ """
241
+ categories: Dict[str, List[Fund]] = {}
242
+
243
+ # Group funds by category
244
+ for fund in funds:
245
+ if fund.category not in categories:
246
+ categories[fund.category] = []
247
+ categories[fund.category].append(fund)
248
+
249
+ bm_data: Dict[str, Dict[str, Any]] = {}
250
+
251
+ for cat_name, cat_funds in categories.items():
252
+ if not cat_funds:
253
+ continue
254
+
255
+ # Use the FIRST fund's benchmark values
256
+ # This matches the CSV structure where we take the first fund's data
257
+ first_fund = cat_funds[0]
258
+
259
+ bm_data[cat_name] = {
260
+ 'cagr_1y': first_fund.cagr_1y_bm if first_fund.cagr_1y_bm is not None else None,
261
+ 'cagr_3y': first_fund.cagr_3y_bm if first_fund.cagr_3y_bm is not None else None,
262
+ 'cagr_5y': first_fund.cagr_5y_bm if first_fund.cagr_5y_bm is not None else None,
263
+ 'cagr_10y': first_fund.cagr_10y_bm if first_fund.cagr_10y_bm is not None else None,
264
+ 'is_calculated': True # Flag to indicate this is calculated from fund data
265
+ }
266
+
267
+ return bm_data
268
+
269
+
270
+ # ─── CSV Loader ───────────────────────────────────────────────────────────────────
271
+
272
def load_fund_csv(csv_path: str) -> Tuple[List[Fund], Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, int]]:
    """
    Parse the fund-stats CSV and merge with reference data from Processed_data.xlsx.
    For sections with missing reference data, auto-calculates category averages from fund data.

    Args:
        csv_path: Path to the fund-stats CSV (old 35-column layout, or the new
            36-column layout that adds 'Category' / 'Scheme Code' columns).

    Returns:
        Tuple of (funds, bm_data, cat_avg_data, fund_weightages):
        - funds: Fund objects in CSV order, deduplicated by (name, category)
        - bm_data: per-category BM Index CAGRs derived from fund rows
        - cat_avg_data: per-category average CAGRs derived from fund rows
        - fund_weightages: weightages taken from the reference workbook

    Raises:
        FileNotFoundError: if csv_path does not exist.
        ValueError: if the CSV contains no rows at all.
    """
    csv_path = Path(csv_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    # Load reference data from Processed_data.xlsx
    ref_bm_data, ref_cat_avg_data, ref_fund_weightages = extract_reference_data(DEFAULT_REFERENCE_PATH)

    funds: List[Fund] = []
    current_category = "Unknown"
    bm_data: Dict[str, Dict[str, Any]] = {}
    cat_avg_data: Dict[str, Dict[str, Any]] = {}

    # utf-8-sig strips a leading BOM; errors='replace' tolerates stray bytes
    with open(csv_path, encoding='utf-8-sig', errors='replace') as f:
        reader = csv.reader(f)
        rows = list(reader)

    # DYNAMIC COLUMN DETECTION - Read header row first
    if not rows:
        raise ValueError("CSV file is empty")

    header = [str(col).strip() for col in rows[0]]
    col_map = {name: idx for idx, name in enumerate(header)}

    print(f"Detected CSV format with {len(header)} columns")

    # Detect format based on column names
    has_category_col = 'Category' in col_map
    has_scheme_code = 'Scheme Code' in col_map

    if has_category_col and has_scheme_code:
        print(" Format: NEW (36 columns with Category column)")
    else:
        print(" Format: OLD (35 columns without Category column)")

    # NOTE(review): pending_bm / pending_cat_avg are written below but never
    # read again — the returned bm_data / cat_avg_data come solely from the
    # calculated values at the end. They look like leftovers of an earlier
    # reference-data merge; confirm before removing.
    pending_bm: Dict[str, Dict[str, Any]] = {}
    pending_cat_avg: Dict[str, Dict[str, Any]] = {}
    seen_fund_category: set[tuple[str, str]] = set()
    deduped_rows = 0

    # Helper to get column index safely
    def get_col_idx(col_name: str) -> Optional[int]:
        return col_map.get(col_name)

    for row_idx, row in enumerate(rows):
        if row_idx == 0:  # Skip header row
            continue

        if not row:
            continue

        col0 = str(row[0]).strip()

        # Category header - detect by checking if most columns are empty
        # Category headers are standalone rows with category name in col0 and empty data columns
        # This catches: "Equity: Large Cap", "Childrens Fund", "ETFs", "Retirement Fund", etc.
        # But NOT "BM Index" or "Category Average" rows
        if col0 not in ('BM Index', 'Category Average', '', 'nan'):
            # Check if this looks like a category header (columns 2-10 are empty)
            # For old format: check columns 2-10 (Benchmark Type is col 1, so skip it)
            # For new format: check columns 2-10 (Category is col 1, so skip it)
            check_cols = row[2:11] if len(row) > 10 else row[2:6]
            non_empty_count = sum(1 for cell in check_cols if str(cell).strip() not in ('', 'nan', 'None', '-'))

            if non_empty_count == 0 and len(col0) > 3:  # All checked columns are empty - this is a category header
                current_category = col0

                # Use reference data if available, otherwise use CSV data (which may be empty)
                if current_category in ref_bm_data:
                    pending_bm[current_category] = ref_bm_data[current_category]
                else:
                    pending_bm[current_category] = None

                if current_category in ref_cat_avg_data:
                    pending_cat_avg[current_category] = ref_cat_avg_data[current_category]
                else:
                    pending_cat_avg[current_category] = None
                continue

        # BM Index row - skip, we're using reference data
        if col0 == 'BM Index':
            continue

        # Category Average row - skip, we're using reference data
        if col0 == 'Category Average':
            continue

        # Skip header rows (repeated headers in CSV)
        if col0 == 'Fund' and len(row) > 1:
            # Check if this is a header row by looking at column 1
            col1 = str(row[1]).strip() if len(row) > 1 else ''
            if col1 in ('Benchmark Type', 'Category'):
                continue

        if col0 in ('', 'nan'):
            continue

        # Parse fund using dynamic column mapping.
        # NOTE: g / get_str close over the current `row`, so they are
        # (re)defined on every iteration by design.
        def g(col_name: str) -> Optional[float]:
            idx = get_col_idx(col_name)
            if idx is None:
                return None
            try:
                return _to_float(row[idx])
            except (IndexError, TypeError):
                return None

        def get_str(col_name: str) -> str:
            idx = get_col_idx(col_name)
            if idx is None:
                return ""
            try:
                return str(row[idx]).strip()
            except (IndexError, TypeError):
                return ""

        # Get category - either from Category column or from current_category
        if has_category_col:
            fund_category = get_str('Category') or current_category
        else:
            fund_category = current_category

        # Get benchmark
        benchmark = get_str('Benchmark Type')

        # Get TER and Turnover with special parsing (percentage -> decimal)
        ter_idx = get_col_idx('TER')
        ter_val = _parse_ter(row[ter_idx]) if ter_idx is not None and len(row) > ter_idx else None

        turnover_idx = get_col_idx('Turn over (%)')
        turnover_val = _parse_turnover(row[turnover_idx]) if turnover_idx is not None and len(row) > turnover_idx else None

        # Drop exact (fund, category) repeats — first occurrence wins
        dedupe_key = (col0.strip().lower(), fund_category.strip().lower())
        if dedupe_key in seen_fund_category:
            deduped_rows += 1
            continue
        seen_fund_category.add(dedupe_key)

        fund = Fund(
            name=col0,
            category=fund_category,
            benchmark=benchmark,
            ter=ter_val,
            turnover=turnover_val,
            mean=g('Mean'),
            cagr_1y=g('1 Year CAGR'),
            cagr_1y_cat=g('1 Year Category CAGR'),
            cagr_1y_bm=g('1 Year Benchmark CAGR'),
            cagr_3y=g('3 Years CAGR'),
            cagr_3y_cat=g('3 Years Category CAGR'),
            cagr_3y_bm=g('3 Years Benchmark CAGR'),
            cagr_5y=g('5 Years CAGR'),
            cagr_5y_cat=g('5 Years Category CAGR'),
            cagr_5y_bm=g('5 Years Benchmark CAGR'),
            cagr_10y=g('10 Years CAGR'),
            cagr_10y_cat=g('10 Years Category CAGR'),
            cagr_10y_bm=g('10 Years Benchmark CAGR'),
            cagr_inception=g('CAGR Since Inception'),
            nav=g('NAV'),
            pe_ratio=g('P/E Ratio'),
            pb_ratio=g('P/B Ratio'),
            alpha=g('Alpha'),
            beta=g('Beta'),
            std_dev=g('Standard Deviation'),
            sharpe=g('Sharpe Ratio'),
            volatility=g('Volatility'),
            sortino=g('Sortino Ratio'),
            # NOTE(review): the `or` fallback also skips a parsed 0.0 from the
            # first header variant — assumed intentional (0 capture ratios
            # treated as missing); confirm.
            up_capture=g('Up Market Capture\nRatio') or g('Up Market Capture'),
            down_capture=g('Down Market Capture\nRatio') or g('Down Market Capture'),
            max_drawdown=g('Maximum Drawdown'),
            r_squared=g('R-Squared'),
            info_ratio=g('Information Ratio'),
            aum=g('Total Assets (in Cr)'),
            fill_status=get_str('Fill Status') or None,
        )
        # Preserve scheme code for downstream NAV / drawdown fixes
        scheme_code_str = get_str('Scheme Code')
        if scheme_code_str:
            setattr(fund, "_scheme_code", scheme_code_str)
        launch_dt = _parse_launch_date(get_str('Launch Date'))
        if launch_dt:
            setattr(fund, "_launch_date", launch_dt)
        fund.order = len(funds)  # Preserve original CSV order for tiebreaker
        funds.append(fund)

    if deduped_rows:
        print(f" Deduplicated {deduped_rows} rows by (Fund, Category) at ingest")

    # Calculate category averages from fund data
    calculated_cat_avg = _calculate_category_averages(funds)

    # Calculate BM Index from fund-level benchmark data
    calculated_bm = _calculate_benchmark_index(funds)

    # Assign BM and Category Average data - ONLY use calculated data from CSV
    # DO NOT use reference data from Processed_data.xlsx
    for cat_name in set(f.category for f in funds):
        # BM Index: Always use calculated data from fund benchmark values
        bm_data[cat_name] = calculated_bm.get(cat_name, {})

        # Category Average: Always use calculated data from fund category values
        cat_avg_data[cat_name] = calculated_cat_avg.get(cat_name, {})

    return funds, bm_data, cat_avg_data, ref_fund_weightages
481
+
482
+
483
+ def _fmt(val, decimals=2) -> Optional[float]:
484
+ """Return rounded float or None."""
485
+ if val is None:
486
+ return None
487
+ try:
488
+ return round(float(val), decimals)
489
+ except (ValueError, TypeError):
490
+ return None
491
+
492
+
493
+ def _quartile_band_for_position(pos: int, total: int) -> Optional[int]:
494
+ """
495
+ Return quartile band by positional rank (0-based) after sorting by score desc.
496
+
497
+ Band mapping:
498
+ - 0: Top quartile (Green)
499
+ - 1: Upper-middle quartile (Yellow)
500
+ - 2: Lower-middle quartile (Orange)
501
+ - 3: Bottom quartile (Red)
502
+
503
+ Uses rank-positioning (not score thresholds), so ties do not distort quartile sizes.
504
+ """
505
+ if total <= 0 or pos < 0 or pos >= total:
506
+ return None
507
+
508
+ # Keep intuitive behavior for tiny categories.
509
+ if total == 1:
510
+ return 0
511
+ if total == 2:
512
+ return 0 if pos == 0 else 3
513
+ if total == 3:
514
+ if pos == 0:
515
+ return 0
516
+ if pos == 1:
517
+ return 1
518
+ return 3
519
+
520
+ q1_end = math.ceil(total * 0.25)
521
+ q2_end = math.ceil(total * 0.50)
522
+ q3_end = math.ceil(total * 0.75)
523
+
524
+ if pos < q1_end:
525
+ return 0
526
+ if pos < q2_end:
527
+ return 1
528
+ if pos < q3_end:
529
+ return 2
530
+ return 3
531
+
532
+
533
+ def _calculate_weightage(fund: Fund, cat_avg_vals: Dict[str, Any]) -> int:
534
+ """
535
+ DEPRECATED: Legacy CAGR-based weightage calculation.
536
+ Use compute_scores() from weightage.py for AI-suggested model.
537
+
538
+ Calculate weightage based on period-weighted scoring against Category Average.
539
+
540
+ Period weights:
541
+ - 1 Year CAGR: 2 pts if fund beats Category Average
542
+ - 3 Years CAGR: 3 pts if fund beats Category Average
543
+ - 5 Years CAGR: 4 pts if fund beats Category Average
544
+ - 10 Years CAGR: 5 pts if fund beats Category Average
545
+
546
+ Max possible: 14 pts
547
+ Note: Treat 0, N/A*, or - as "no data" (skip comparison)
548
+ """
549
+ weightage = 0
550
+
551
+ # Period weights mapping
552
+ period_weights = {
553
+ 'cagr_1y': 2,
554
+ 'cagr_3y': 3,
555
+ 'cagr_5y': 4,
556
+ 'cagr_10y': 5,
557
+ }
558
+
559
+ for attr, weight in period_weights.items():
560
+ fund_val = getattr(fund, attr, None)
561
+ cat_avg_val = cat_avg_vals.get(attr) if cat_avg_vals else None
562
+
563
+ # Skip if fund value is 0, None, or invalid
564
+ if fund_val is None or fund_val == 0:
565
+ continue
566
+ if cat_avg_val is None or cat_avg_val == 0:
567
+ continue
568
+
569
+ # Award points if fund beats category average
570
+ if fund_val > cat_avg_val:
571
+ weightage += weight
572
+
573
+ return weightage
574
+
575
+
576
def _calculate_green_cell_weightage(fund: Fund, all_funds_in_category: List[Fund]) -> int:
    """
    Score a fund as the number of metrics where it ranks in the category's
    top 10 — i.e. the count of cells the workbook highlights GREEN.

    Mirrors the Excel conditional-formatting rules: only "top-10 = green"
    metrics participate (bottom-10 metrics are highlighted red and are
    never counted). Eleven metrics qualify, so the maximum score is 11.
    """
    # Metrics whose cells carry Top-10-green formatting (workbook column noted).
    top10_green_metrics = (
        'cagr_1y',         # F
        'cagr_3y',         # G
        'cagr_5y',         # H
        'cagr_10y',        # I
        'cagr_inception',  # J
        'alpha',           # N
        'sharpe',          # R
        'sortino',         # S
        'up_capture',      # T
        'info_ratio',      # X
        'aum',             # Y (Total Assets)
    )
    return sum(
        1
        for metric in top10_green_metrics
        if _is_in_top_10(fund, all_funds_in_category, metric, higher_is_better=True)
    )
613
+
614
+
615
def _is_in_top_10(fund: Fund, all_funds: List[Fund], metric: str, higher_is_better: bool) -> bool:
    """
    Check if a fund is in top 10 for a given metric within its category.

    Args:
        fund: The fund to check
        all_funds: All funds in the same category
        metric: The metric attribute name (e.g., 'cagr_1y', 'ter')
        higher_is_better: True if higher values are better, False if lower is better

    Returns: True if fund is in top 10, False otherwise
    """
    fund_val = getattr(fund, metric, None)

    # Skip if fund doesn't have this metric (0 is treated as "no data" by convention
    # throughout this module).
    if fund_val is None or fund_val == 0:
        return False

    # Collect all valid (present, non-zero) values for this metric in the category.
    valid_values = []
    for f in all_funds:
        val = getattr(f, metric, None)
        if val is not None and val != 0:
            valid_values.append(val)

    # Need at least 10 funds with data to have a meaningful top 10.
    if len(valid_values) < 10:
        # Small category: fall back to a "top half" check instead.
        if len(valid_values) < 2:
            return False
        valid_values.sort(reverse=higher_is_better)
        threshold_idx = len(valid_values) // 2
        threshold = valid_values[threshold_idx]
        if higher_is_better:
            return fund_val >= threshold
        else:
            return fund_val <= threshold

    # Rank by counting strictly-better peers. (The previous version also sorted
    # valid_values here, but the sorted order was never used on this path, so the
    # O(n log n) sort has been removed — behavior is unchanged.)
    if higher_is_better:
        better_count = sum(1 for v in valid_values if v > fund_val)
    else:
        better_count = sum(1 for v in valid_values if v < fund_val)

    # Fund is in top 10 if 9 or fewer funds are strictly better (ranks 1-10).
    return better_count <= 9
664
+
665
+
666
def _get_cagr_font_color() -> Font:
    """Return the default (black) font for CAGR cells.

    Font coloring is deliberately disabled per instructions:
    "CRITICAL: NO green/red font coloring anywhere".
    """
    return FONT_DEFAULT
672
+
673
+
674
def _apply_conditional_formatting(ws, start_row: int, end_row: int, cat_avg_vals: Dict[str, Any]):
    """
    Apply conditional formatting rules per MF_Scoring_Model.md

    Light Green (C6EFCE) + Dark Green Text (006100) for:
    - Top 10: CAGR (all periods), Alpha, Sharpe, Sortino, Up Capture, R-Squared, Info Ratio, Total Assets, CAGR Since Inception
    - Bottom 10: TER, Turnover, Beta, Std Dev, Down Capture, P/E, P/B, Max Drawdown

    Light Red (FFC7CE) for threshold violations:
    - Alpha < 1
    - Info Ratio < 0
    - CAGR < Category Average (all periods)

    Args:
        ws: openpyxl worksheet the rules are added to.
        start_row / end_row: inclusive 1-based row span of this category's fund rows.
        cat_avg_vals: category-average values keyed by metric attr (e.g. 'cagr_1y').
    """
    # NOTE(review): start_row == end_row (a single fund row) is skipped entirely —
    # confirm that single-fund categories should really get no highlighting.
    if start_row >= end_row:
        return

    # Define colors for conditional formatting
    green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    green_font = Font(color="006100")
    red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
    red_font = Font(color="9C0006")

    # ═══════════════════════════════════════════════════════════════════════════
    # DUAL-CONDITION COLUMNS (Green for Top 10, Red for threshold violations)
    # ═══════════════════════════════════════════════════════════════════════════
    # Red rules are added FIRST with stopIfTrue=True so red wins over green when
    # both conditions match a cell (rule priority follows insertion order).

    # CAGR columns: Green for Top 10, Red if < Category Average
    cagr_cols = {
        'F': (6, cat_avg_vals.get('cagr_1y')),   # 1 Year CAGR
        'G': (7, cat_avg_vals.get('cagr_3y')),   # 3 Years CAGR
        'H': (8, cat_avg_vals.get('cagr_5y')),   # 5 Years CAGR
        'I': (9, cat_avg_vals.get('cagr_10y')),  # 10 Years CAGR
    }

    for col_letter, (col_num, cat_avg) in cagr_cols.items():
        range_str = f"{col_letter}{start_row}:{col_letter}{end_row}"

        # Rule 1: Red if < Category Average (higher priority)
        if cat_avg is not None:
            rule_red = CellIsRule(
                operator='lessThan',
                formula=[str(cat_avg)],
                stopIfTrue=True,  # Stop if red applies
                fill=red_fill,
                font=red_font
            )
            ws.conditional_formatting.add(range_str, rule_red)

        # Rule 2: Green for Top 10
        rule_green = Rule(
            type='top10',
            rank=10,
            stopIfTrue=False
        )
        rule_green.dxf = DifferentialStyle(fill=green_fill, font=green_font)
        ws.conditional_formatting.add(range_str, rule_green)

    # Alpha (Col N = 14): Green for Top 10, Red if < 1
    range_str = f"N{start_row}:N{end_row}"
    rule_red = CellIsRule(
        operator='lessThan',
        formula=['1'],
        stopIfTrue=True,
        fill=red_fill,
        font=red_font
    )
    ws.conditional_formatting.add(range_str, rule_red)

    rule_green = Rule(type='top10', rank=10, stopIfTrue=False)
    rule_green.dxf = DifferentialStyle(fill=green_fill, font=green_font)
    ws.conditional_formatting.add(range_str, rule_green)

    # Information Ratio (Col X = 24): Green for Top 10, Red if < 0
    range_str = f"X{start_row}:X{end_row}"
    rule_red = CellIsRule(
        operator='lessThan',
        formula=['0'],
        stopIfTrue=True,
        fill=red_fill,
        font=red_font
    )
    ws.conditional_formatting.add(range_str, rule_red)

    rule_green = Rule(type='top10', rank=10, stopIfTrue=False)
    rule_green.dxf = DifferentialStyle(fill=green_fill, font=green_font)
    ws.conditional_formatting.add(range_str, rule_green)

    # ═══════════════════════════════════════════════════════════════════════════
    # TOP 10 COLUMNS (Green - Higher is Better)
    # ═══════════════════════════════════════════════════════════════════════════

    # Values here are human-readable column names; only the keys (letters) drive
    # the formatting — the name is documentation for the mapping itself.
    top10_cols = {
        'J': 'CAGR Since Inception',
        'R': 'Sharpe Ratio',
        'S': 'Sortino Ratio',
        'T': 'Up Market Capture',
        'W': 'R-Squared',
        'Y': 'Total Assets'
    }

    for col_letter, name in top10_cols.items():
        range_str = f"{col_letter}{start_row}:{col_letter}{end_row}"
        rule = Rule(type='top10', rank=10, stopIfTrue=False)
        rule.dxf = DifferentialStyle(fill=green_fill, font=green_font)
        ws.conditional_formatting.add(range_str, rule)

    # Maximum Drawdown (Col V): Top 10 among NON-ZERO values only.
    # This keeps zeros as "no data" and avoids green highlighting for zero entries.
    v_range = f"V{start_row}:V{end_row}"
    # Guard against text placeholders like "NA": Excel treats "NA" <> 0 as TRUE,
    # which can incorrectly qualify the cell for highlighting. Only numeric values participate.
    v_formula = (
        f'AND('
        f'ISNUMBER(V{start_row}),'
        f'V{start_row}<>0,'
        f'COUNTIFS($V${start_row}:$V${end_row},\">\"&V{start_row},$V${start_row}:$V${end_row},\"<>0\")<10'
        f')'
    )
    v_rule = FormulaRule(formula=[v_formula], stopIfTrue=False, fill=green_fill, font=green_font)
    ws.conditional_formatting.add(v_range, v_rule)

    # ═══════════════════════════════════════════════════════════════════════════
    # BOTTOM 10 COLUMNS (Green - Lower is Better)
    # ═══════════════════════════════════════════════════════════════════════════

    bottom10_cols = {
        'C': 'TER',
        'D': 'Turnover',
        'L': 'P/E Ratio',
        'P': 'Beta',
        'Q': 'Standard Deviation',
        'U': 'Down Market Capture'
    }

    for col_letter, name in bottom10_cols.items():
        range_str = f"{col_letter}{start_row}:{col_letter}{end_row}"
        rule = Rule(
            type='top10',
            rank=10,
            bottom=True,  # Bottom 10 = lowest values
            stopIfTrue=False
        )
        rule.dxf = DifferentialStyle(fill=green_fill, font=green_font)
        ws.conditional_formatting.add(range_str, rule)
818
+
819
+
820
def export_excel(funds: List[Fund], output_path: str,
                 bm_data: Dict[str, Dict[str, Any]] = None,
                 cat_avg_data: Dict[str, Dict[str, Any]] = None) -> str:
    """Build the processed Excel matching target format exactly.

    Args:
        funds: all fund rows to export; grouped by ``fund.category`` in first-seen order.
        output_path: destination .xlsx path (parent dirs are created).
        bm_data: per-category benchmark-index metric values (attr -> value).
        cat_avg_data: per-category average metric values (attr -> value).

    Returns:
        The saved workbook path as a string.

    Side effects:
        Writes the workbook; if any value was converted to "NA", also writes a
        tab-separated audit file ``<stem>_na_audit.txt`` next to the workbook
        and prints its path.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if bm_data is None:
        bm_data = {}
    if cat_avg_data is None:
        cat_avg_data = {}

    wb = Workbook()
    ws = wb.active
    ws.title = "Sheet2"
    na_audit_rows: List[str] = []

    # Apply NA policy to all numeric export columns.
    # Exclusions are text/derived columns that should stay as-is.
    na_on_zero_attrs = {
        attr for _, attr, _, _, _ in XLSX_COLUMNS
        if attr and attr not in {"name", "benchmark", "weightage"}
    }
    # Years of history each CAGR column requires (used for NA-reason audit only).
    cagr_period_by_attr = {
        "cagr_1y": 1,
        "cagr_3y": 3,
        "cagr_5y": 5,
        "cagr_10y": 10,
    }

    def _years_since_launch(fund_obj: Fund) -> Optional[float]:
        # Fund age in years from the private `_launch_date`; None when absent/not a datetime.
        launch_dt = getattr(fund_obj, "_launch_date", None)
        if not isinstance(launch_dt, datetime):
            return None
        return max(0.0, (datetime.now() - launch_dt).days / 365.25)

    def _audit_na(row_type: str, category: str, fund_name: str, attr: str, reason: str) -> None:
        # One tab-separated audit line per NA decision.
        na_audit_rows.append(
            f"{row_type}\t{category}\t{fund_name}\t{attr}\t{reason}"
        )

    def _display_numeric_or_na(
        *,
        attr: str,
        value: Any,
        row_type: str,
        category: str,
        fund_obj: Optional[Fund] = None,
        fund_name: str = "",
        decimals: int = 2,
    ) -> Any:
        """
        Convert numeric value to rounded float or 'NA' for missing/invalid values.
        Also appends NA decisions to audit rows.
        Category Average: PE and PB show blank (not NA) when missing.
        """
        # Category Average row: PE and PB stay blank when missing
        if row_type == "CATEGORY_AVG" and attr in ("pe_ratio", "pb_ratio"):
            if value is None:
                return None
            try:
                num = float(value)
                return round(num, decimals) if num != 0 else None
            except (TypeError, ValueError):
                return None

        if attr in na_on_zero_attrs:
            if value is None:
                _audit_na(row_type, category, fund_name, attr, "missing value")
                return "NA"
            try:
                num = float(value)
            except (TypeError, ValueError):
                _audit_na(row_type, category, fund_name, attr, "non-numeric value")
                return "NA"

            if num == 0:
                # Duration-aware reason for CAGR periods when launch date exists.
                if fund_obj is not None and attr in cagr_period_by_attr:
                    years = _years_since_launch(fund_obj)
                    period = cagr_period_by_attr[attr]
                    if years is not None and years < period:
                        _audit_na(
                            row_type,
                            category,
                            fund_name,
                            attr,
                            f"fund age {years:.2f}y < required {period}y",
                        )
                    else:
                        _audit_na(row_type, category, fund_name, attr, "source value is 0")
                else:
                    _audit_na(row_type, category, fund_name, attr, "source value is 0")
                return "NA"

            return round(num, decimals)

        # Non-NA-managed attributes use existing behavior.
        if value is None:
            return None
        try:
            return round(float(value), decimals)
        except (TypeError, ValueError):
            return value

    # ── Row 1: Column headers (include weight hints for scored metrics) ─────
    ws.row_dimensions[1].height = 36
    for col_idx, (header, attr, width, _, _) in enumerate(XLSX_COLUMNS, start=1):
        # If this column participates in the scoring model, append its weight
        # so the advisor can see weights even when scrolled deep into a category.
        weight_hint = WEIGHT_REF_ROW.get(attr)
        if weight_hint:
            header_value = f"{header}\n({weight_hint})"
        else:
            header_value = header

        cell = ws.cell(row=1, column=col_idx, value=header_value)
        cell.fill = FILL_HEADER
        cell.font = FONT_HEADER
        cell.border = BORDER_THIN
        cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
        ws.column_dimensions[get_column_letter(col_idx)].width = width

    # Freeze col A + row 1 so fund names and headers stay visible while scrolling
    ws.freeze_panes = "B2"

    # ── Group funds by category ────────────────────────────────────────────────
    categories: Dict[str, List[Fund]] = {}
    category_order = []
    for fund in funds:
        if fund.category not in categories:
            category_order.append(fund.category)
        categories.setdefault(fund.category, []).append(fund)

    current_row = 2

    for idx, cat_name in enumerate(category_order):
        cat_funds = categories[cat_name]

        # Sort by score (displayed value) descending so Weightage column is strictly largest-to-lowest
        sorted_funds = sorted(
            cat_funds,
            key=lambda f: (-(f.score or 0), (f.name or "").lower(), getattr(f, 'order', 0)),
        )

        # Quartiles by positional rank, not by score thresholds.
        # This guarantees consistent quartile sizing even when many funds share the same score.
        quartile_by_fund_id: Dict[int, int] = {}
        for pos, fund in enumerate(sorted_funds):
            band = _quartile_band_for_position(pos, len(sorted_funds))
            if band is not None:
                quartile_by_fund_id[id(fund)] = band

        # ── Header row (repeat before each category except first) ─────────────
        if idx > 0:
            ws.row_dimensions[current_row].height = 32
            for col_idx, (header, _, _, _, _) in enumerate(XLSX_COLUMNS, start=1):
                cell = ws.cell(row=current_row, column=col_idx, value=header)
                cell.fill = FILL_HEADER
                cell.font = FONT_HEADER
                cell.border = BORDER_THIN
                cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
            current_row += 1

        # ── Category header row ───────────────────────────────────────────────
        ws.row_dimensions[current_row].height = 20
        for col_idx in range(1, NUM_COLS + 1):
            cell = ws.cell(row=current_row, column=col_idx)
            cell.fill = FILL_WHITE
            cell.border = BORDER_THIN
        cat_cell = ws.cell(row=current_row, column=1, value=cat_name)
        cat_cell.font = FONT_CAT_HEADER
        cat_cell.alignment = Alignment(horizontal="left", vertical="center", wrap_text=True)
        ws.merge_cells(start_row=current_row, start_column=1,
                       end_row=current_row, end_column=NUM_COLS - 1)
        current_row += 1

        # ── BM Index row ───────────────────────────────────────────────────────
        bm_vals = bm_data.get(cat_name, {})
        ws.row_dimensions[current_row].height = 14
        for col_idx, (header, attr, _, _, _) in enumerate(XLSX_COLUMNS, start=1):
            val = None
            if col_idx == 1:
                val = "BM Index"
            elif attr in bm_vals:
                val = _display_numeric_or_na(
                    attr=attr,
                    value=bm_vals[attr],
                    row_type="BM_INDEX",
                    category=cat_name,
                    fund_name="BM Index",
                    decimals=2,
                )

            cell = ws.cell(row=current_row, column=col_idx, value=val)
            if col_idx == 1:
                cell.fill = FILL_BM_ROW
            elif col_idx in [6, 7, 8, 9]:
                # Columns F-I are the CAGR columns; they get the BM CAGR tint.
                cell.fill = FILL_BM_CAGR
            else:
                cell.fill = FILL_WHITE
            cell.font = FONT_DEFAULT_BOLD
            cell.border = BORDER_THIN
            cell.alignment = Alignment(
                horizontal="right" if col_idx > 2 else "left",
                vertical="center", wrap_text=(col_idx == 1)
            )
        current_row += 1

        # ── Category Average row ──────────────────────────────────────────────
        cat_avg_vals = cat_avg_data.get(cat_name, {})
        ws.row_dimensions[current_row].height = 14
        for col_idx, (header, attr, _, _, _) in enumerate(XLSX_COLUMNS, start=1):
            val = None
            if col_idx == 1:
                val = "Category Average"
            elif attr in cat_avg_vals:
                val = _display_numeric_or_na(
                    attr=attr,
                    value=cat_avg_vals[attr],
                    row_type="CATEGORY_AVG",
                    category=cat_name,
                    fund_name="Category Average",
                    decimals=2,
                )

            cell = ws.cell(row=current_row, column=col_idx, value=val)
            if col_idx == 1:
                cell.fill = FILL_CAT_AVG
            elif col_idx in [6, 7, 8, 9, 12, 13]:
                cell.fill = FILL_CAT_CAGR
            else:
                cell.fill = FILL_WHITE
            cell.font = FONT_DEFAULT_BOLD
            cell.border = BORDER_THIN
            cell.alignment = Alignment(
                horizontal="right" if col_idx > 2 else "left",
                vertical="center", wrap_text=(col_idx == 1)
            )
        current_row += 1

        # ── Fund rows ─────────────────────────────────────────────────────────
        fund_start_row = current_row

        top_5_fund_ids = {id(f) for f in sorted_funds[:5]}

        for fund in sorted_funds:
            # 36pt height = comfortable 2-line display for long fund names
            # without the advisor needing to drag rows on macOS or Windows
            ws.row_dimensions[current_row].height = 36

            weightage = fund.score or 0
            score_val = round(weightage, 3)
            is_top_5 = id(fund) in top_5_fund_ids

            for col_idx, (header, attr, _, _, decimals) in enumerate(XLSX_COLUMNS, start=1):
                if attr == "weightage":
                    val = score_val
                    cell_font = FONT_DEFAULT_BOLD if is_top_5 else FONT_DEFAULT
                elif attr:
                    raw_val = getattr(fund, attr, None)
                    if attr in ('name', 'benchmark'):
                        val = raw_val if raw_val else None
                        cell_font = FONT_DEFAULT_BOLD if (col_idx == 1 and is_top_5) else FONT_DEFAULT
                    else:
                        val = _display_numeric_or_na(
                            attr=attr,
                            value=raw_val,
                            row_type="FUND",
                            category=fund.category,
                            fund_obj=fund,
                            fund_name=fund.name,
                            decimals=decimals,
                        )
                        cell_font = FONT_DEFAULT
                else:
                    val = None
                    cell_font = FONT_DEFAULT

                cell = ws.cell(row=current_row, column=col_idx, value=val)

                if is_top_5 and col_idx == 1:
                    cell.fill = FILL_WEIGHTED_YELLOW
                elif attr == "weightage":
                    # Quartile fill by positional band computed above (0 = best quartile).
                    quartile_band = quartile_by_fund_id.get(id(fund))
                    if quartile_band == 0: cell.fill = FILL_QUARTILE_GREEN
                    elif quartile_band == 1: cell.fill = FILL_QUARTILE_YELLOW
                    elif quartile_band == 2: cell.fill = FILL_QUARTILE_ORANGE
                    elif quartile_band == 3: cell.fill = FILL_QUARTILE_RED
                    else: cell.fill = FILL_WHITE
                else:
                    cell.fill = FILL_WHITE

                cell.font = cell_font
                cell.border = BORDER_THIN
                cell.alignment = Alignment(
                    horizontal="left" if col_idx <= 2 else "right",
                    vertical="top",  # top-align so wrapped text reads naturally
                    wrap_text=True,  # prevents truncation on any screen or zoom level
                )

                # Columns C/D (TER, Turnover) display as percentages.
                if col_idx == 3: cell.number_format = '0.00%'
                elif col_idx == 4: cell.number_format = '0.00%'
                elif attr == "weightage": cell.number_format = '0.000'

            current_row += 1

        # Apply conditional formatting to this section's fund rows
        fund_end_row = current_row - 1
        if fund_end_row >= fund_start_row and cat_avg_vals:
            _apply_conditional_formatting(ws, fund_start_row, fund_end_row, cat_avg_vals)

    wb.save(str(output_path))
    if na_audit_rows:
        audit_path = output_path.with_name(f"{output_path.stem}_na_audit.txt")
        lines = [
            "NA AUDIT TRACE",
            f"Generated: {datetime.now().isoformat()}",
            "Columns: row_type<TAB>category<TAB>fund_name<TAB>metric_attr<TAB>reason",
            "-" * 80,
            *na_audit_rows,
        ]
        audit_path.write_text("\n".join(lines), encoding="utf-8")
        print(f"NA audit trace written: {audit_path}")
    return str(output_path)
1145
+
1146
+
1147
+ def _avg(values: List[Optional[float]]) -> Optional[float]:
1148
+ """Compute average of non-None values."""
1149
+ valid = [v for v in values if v is not None]
1150
+ if not valid:
1151
+ return None
1152
+ return round(sum(valid) / len(valid), 2)
1153
+
1154
+
1155
+ # ─── Pipeline entry ────────────────────────────────────────────────────────────────
1156
+
1157
def run_data_engine(csv_path: str,
                    output_path: str = "output/fund_analysis.xlsx",
                    use_comprehensive_scoring: bool = True) -> List[Fund]:
    """
    Full pipeline: load -> score -> export Excel.

    Args:
        csv_path: Path to the fund-stats CSV file
        output_path: Path to save the output Excel file
        use_comprehensive_scoring: If True, uses AI-suggested model (10-point scale with Top/Bottom 10).
                                   If False, uses legacy CAGR-based weightage.

    Returns:
        The list of Fund objects with ``score`` and ``weightage`` populated.
    """
    print(f"Loading fund data from: {csv_path}")
    # NOTE(review): ref_fund_weightages is unpacked but not used here — confirm
    # whether it is still needed or can be dropped from the loader's return.
    funds, bm_data, cat_avg_data, ref_fund_weightages = load_fund_csv(csv_path)
    print(f" Loaded {len(funds)} fund schemes")

    # Proactively fix zero / missing drawdown cells using live NAV history
    # so Maximum Drawdown can participate in scoring instead of staying at 0.
    try:
        fixed_mdd = drawdown_zero_fix(funds, verbose=True)
        if fixed_mdd:
            print(f" Fixed {fixed_mdd} zero/missing drawdown cells via NAV engine")
    except Exception as exc:
        # Best-effort: a failed drawdown fix must not abort the whole export.
        print(f" WARNING: drawdown_zero_fix failed: {exc}")

    if use_comprehensive_scoring:
        # Use AI-suggested model (10-point scale)
        print(" Using AI-suggested scoring model (10-point scale with Top/Bottom 10)...")

        # Import and use the new compute_scores function
        funds = compute_scores(funds)

        # Copy score to weightage field for Excel export compatibility
        for fund in funds:
            fund.weightage = int(round(fund.score)) if fund.score else 0

        with_highlight = sum(1 for f in funds if (f.score or 0) > 8)
        print(f" Calculated AI-suggested weightage. {with_highlight} funds have score > 8")
    else:
        # Use legacy CAGR-based weightage
        print(" Using legacy CAGR-based weightage...")
        for fund in funds:
            cat_avg_vals = cat_avg_data.get(fund.category, {})
            fund.weightage = _calculate_weightage(fund, cat_avg_vals)
            fund.score = float(fund.weightage)

        with_highlight = sum(1 for f in funds if (f.weightage or 0) > 8)
        print(f" Calculated weightage. {with_highlight} funds have weightage > 8")

    print(f"Exporting processed Excel to: {output_path}")
    path = export_excel(funds, output_path, bm_data, cat_avg_data)
    print(f"Done! Saved: {path}")

    return funds
src/index_fund_ingest.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Index Fund Ingest β€” capture index funds the same way as raw CSV (mftool/AMFI).
3
+
4
+ Two sources:
5
+ - mftool (default): Same as raw CSV under PS β€” AMFI category 38 (Index Funds/ETFs).
6
+ Returns only the schemes AMFI lists under that category (curated, ~same count as
7
+ your fund-stats CSV Index Fund section). Output format matches PS: "Index Fund",
8
+ hyphenated fund names.
9
+ - mfapi: Search mfapi.in and filter by index; use when you need more schemes.
10
+
11
+ Usage:
12
+ python -m src.index_fund_ingest [--output index_funds.csv] # default: mftool
13
+ python -m src.index_fund_ingest --source mfapi [--limit 100] # mfapi search
14
+ Then: enrich the output CSV, merge into main fund CSV, run data_engine as usual.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import csv
21
+ import re
22
+ import time
23
+ from datetime import datetime, timedelta
24
+ from pathlib import Path
25
+
26
+ import requests
27
+
28
# Same AMFI gateway as mftool (get_open_ended_other_scheme_performance)
AMFI_FUND_PERFORMANCE_URL = "https://www.amfiindia.com/gateway/pollingsebi/api/amfi/fundperformance"
AMFI_CATEGORY_OTHER = 5
AMFI_SUBCATEGORY_INDEX_FUNDS = 38  # "Index Funds/ETFs"

# mfapi.in endpoints; MFAPI_NAV takes a scheme_code via str.format.
MFAPI_LIST = "https://api.mfapi.in/mf"
MFAPI_SEARCH = "https://api.mfapi.in/mf/search"
MFAPI_NAV = "https://api.mfapi.in/mf/{scheme_code}"
SLEEP = 0.3  # polite delay between API calls

# CSV headers matching project fund-stats CSV (must match data_engine / csv_enrichment).
# Note the embedded "\n" in the two capture-ratio headers — it is part of the header text.
FUND_CSV_HEADERS = [
    "Fund", "Category", "Scheme Code", "Launch Date", "Total Assets (in Cr)",
    "TER", "Turn over (%)", "CAGR Since Inception",
    "1 Year CAGR", "1 Year Category CAGR", "1 Year Benchmark CAGR",
    "3 Years CAGR", "3 Years Category CAGR", "3 Years Benchmark CAGR",
    "5 Years CAGR", "5 Years Category CAGR", "5 Years Benchmark CAGR",
    "10 Years CAGR", "10 Years Category CAGR", "10 Years Benchmark CAGR",
    "Benchmark Type", "NAV", "Alpha", "Beta", "Standard Deviation",
    "Sharpe Ratio", "Volatility", "Mean", "Sortino Ratio",
    "Up Market Capture\nRatio", "Down Market Capture\nRatio",
    "Maximum Drawdown", "R-Squared", "Information Ratio", "P/E Ratio", "P/B Ratio",
]

# Raw CSV under PS uses "Index Fund" (no "Equity:" prefix) for this category
INDEX_FUND_CATEGORY_PS = "Index Fund"

# mfapi scheme_category (from NAV meta) -> our Category label
# (matched as lowercase substrings by _normalize_category)
CATEGORY_MAP = {
    "index fund": "Equity: Index Fund",
    "index funds": "Equity: Index Fund",
    "equity scheme - index fund": "Equity: Index Fund",
    "equity scheme - index funds": "Equity: Index Fund",
}
62
+
63
+
64
+ def _to_hyphenated(name: str) -> str:
65
+ """Convert scheme name to hyphenated form like raw CSV under PS (e.g. DSP-Nifty-50-Index-Fund-Regular-Plan-Growth)."""
66
+ if not name:
67
+ return ""
68
+ # Replace spaces and multiple hyphens with single hyphen, strip
69
+ s = re.sub(r"[\s_]+", "-", name.strip())
70
+ return re.sub(r"-+", "-", s).strip("-")
71
+
72
+
73
+ def _get_amfi_report_date() -> str:
74
+ """DD-MMM-YYYY for AMFI API. Use last weekday (API returns empty for weekend dates)."""
75
+ today = datetime.now().date()
76
+ d = today
77
+ for _ in range(7):
78
+ if d.weekday() < 5: # Mon=0 .. Fri=4
79
+ break
80
+ d -= timedelta(days=1)
81
+ return d.strftime("%d-%b-%Y")
82
+
83
# Scheme name fragments -> Benchmark Type (for nav_metrics_engine)
# Order matters: more specific (e.g. Nifty 500) before generic (Nifty 50)
# Patterns are regexes applied by _infer_benchmark to the lowercased scheme name.
BENCHMARK_INFER = [
    (r"nifty\s*500|nifty500", "Nifty 500"),
    (r"nifty\s*200|nifty200", "Nifty 200"),
    (r"nifty\s*100|nifty100", "Nifty 100"),
    (r"nifty\s*next\s*50|nifty\s*junior|niftyjr", "Nifty Next 50"),
    (r"nifty\s*50|nifty50", "Nifty 50"),
    (r"nifty\s*midcap\s*150|midcap\s*150", "Nifty Midcap 150"),
    (r"nifty\s*smallcap\s*250|smallcap\s*250", "Nifty Smallcap 250"),
    (r"sensex|bse\s*sensex", "BSE Sensex"),
    (r"bse\s*100", "BSE 100"),
    (r"bse\s*500", "BSE 500"),
]
97
+
98
+
99
+ def _normalize_category(meta_category: str | None) -> str:
100
+ if not meta_category:
101
+ return "Equity: Index Fund"
102
+ key = meta_category.strip().lower()
103
+ for k, v in CATEGORY_MAP.items():
104
+ if k in key:
105
+ return v
106
+ if "index" in key:
107
+ return "Equity: Index Fund"
108
+ return meta_category.strip()
109
+
110
+
111
def _infer_benchmark(scheme_name: str) -> str:
    """Guess the Benchmark Type from the scheme name via BENCHMARK_INFER patterns."""
    lowered = (scheme_name or "").lower()
    for pattern, benchmark in BENCHMARK_INFER:
        if re.search(pattern, lowered):
            return benchmark
    return "Nifty 50"  # safe default for index funds
117
+
118
+
119
def _search_mfapi(query: str, limit: int = 200) -> list[dict]:
    """Return list of {schemeCode, schemeName} from mfapi search."""
    try:
        response = requests.get(MFAPI_SEARCH, params={"q": query}, timeout=15)
        response.raise_for_status()
        payload = response.json()
        # Anything other than a JSON array is treated as "no results".
        return payload[:limit] if isinstance(payload, list) else []
    except Exception as e:
        # Best-effort: log and return empty rather than aborting discovery.
        print(f" [search] error for '{query}': {e}")
        return []
131
+
132
+
133
def _fetch_nav_meta(scheme_code: str) -> dict | None:
    """Fetch NAV endpoint and return meta only (scheme_name, scheme_category)."""
    url = MFAPI_NAV.format(scheme_code=scheme_code)
    try:
        response = requests.get(url, params={"limit": 1}, timeout=15)
        response.raise_for_status()
        meta = response.json().get("meta") or {}
        # Normalize missing/None values to "" for the three fields we use.
        return {
            key: meta.get(key) or ""
            for key in ("scheme_name", "scheme_category", "fund_house")
        }
    except Exception as e:
        print(f" [nav meta] {scheme_code}: {e}")
        return None
149
+
150
+
151
def get_index_funds_via_mftool(verbose: bool = True) -> list[dict]:
    """
    Fetch index funds from the same AMFI API used by mftool (category 5, subCategory 38).
    Returns the same curated list as would appear in the raw CSV under PS — not 10k schemes.
    Each item: scheme_name, benchmark_type. Scheme code is left blank; enrichment will resolve.

    Tries today's date first, then walks back up to 8 calendar days (skipping
    weekends) until the API returns a non-empty list.
    """
    out: list[dict] = []
    base_date = datetime.now().date()
    for day_back in range(8):  # try up to 8 days back to get a date with data
        d = base_date - timedelta(days=day_back)
        if d.weekday() >= 5:  # skip weekend
            continue
        report_date = d.strftime("%d-%b-%Y")
        payload = {
            "maturityType": 1,
            "category": AMFI_CATEGORY_OTHER,
            "subCategory": AMFI_SUBCATEGORY_INDEX_FUNDS,
            "mfid": 0,
            "reportDate": report_date,
        }
        try:
            resp = requests.post(
                AMFI_FUND_PERFORMANCE_URL,
                headers={"User-Agent": "Mozilla/5.0"},
                json=payload,
                timeout=25,
            )
            resp.raise_for_status()
            data = resp.json()
            raw_list = data.get("data") or []
            for item in raw_list:
                name = (item.get("schemeName") or "").strip()
                if not name:
                    continue
                # Exclude ETFs so we match raw CSV (Index Fund section has open-ended funds only)
                # NOTE(review): this matches the literal token " ETF" only — names where
                # "ETF" appears without a leading space would slip through; confirm coverage.
                if " ETF" in name or name.endswith(" ETF"):
                    continue
                benchmark = (item.get("benchmark") or "").strip() or "Nifty 50"
                out.append({
                    "scheme_name": name,
                    "benchmark_type": benchmark,
                    "scheme_code": "",  # AMFI API doesn't return code; enrichment resolves
                    "category": INDEX_FUND_CATEGORY_PS,
                })
            if out:
                if verbose:
                    print(f"[mftool] AMFI category 38 (Index Funds/ETFs): {len(out)} schemes (report date {report_date})")
                break
        except Exception as e:
            # Only report the first failure to keep the retry loop quiet.
            if verbose and day_back == 0:
                print(f"[mftool] AMFI request failed for {report_date}: {e}")
            continue
    if not out and verbose:
        print("[mftool] No schemes returned (tried several weekdays). Check AMFI API.")
    return out
206
+
207
+
208
+ def _is_index_scheme(meta_category: str, scheme_name: str) -> bool:
209
+ """True if this scheme should be treated as index fund."""
210
+ cat = (meta_category or "").lower()
211
+ name = (scheme_name or "").lower()
212
+ if "index" in cat:
213
+ return True
214
+ if "index" in name and ("fund" in name or "etf" not in name):
215
+ return True
216
+ # Explicit index benchmarks in name
217
+ if re.search(r"nifty\s*50|nifty\s*next\s*50|sensex|nifty\s*100|nifty\s*500", name):
218
+ return True
219
+ return False
220
+
221
+
222
def discover_index_schemes(
    search_queries: list[str] | None = None,
    limit_per_query: int = 150,
    require_index_category: bool = True,
    verbose: bool = True,
) -> list[dict]:
    """
    Discover index fund schemes via mfapi search and NAV meta.

    For each search query, candidate schemes are looked up one NAV-meta call at
    a time (throttled by SLEEP), filtered via _is_index_scheme, then normalized.

    Returns list of dicts: scheme_code, scheme_name, category, benchmark_type.
    """
    if search_queries is None:
        search_queries = ["Index", "Index Fund", "Nifty 50", "Nifty Next 50", "Sensex"]
    seen_codes: set[int] = set()
    out: list[dict] = []

    for q in search_queries:
        if verbose:
            print(f"[discover] search q={q!r} …")
        candidates = _search_mfapi(q, limit=limit_per_query)
        for item in candidates:
            code = item.get("schemeCode")
            if code is None or code in seen_codes:
                continue
            name = item.get("schemeName") or ""
            time.sleep(SLEEP)  # polite throttle before each NAV-meta request
            meta = _fetch_nav_meta(str(code))
            if not meta:
                continue
            cat = meta.get("scheme_category") or ""
            if require_index_category and not _is_index_scheme(cat, name):
                continue
            # NOTE(review): codes are only recorded AFTER passing the filter, so a
            # rejected code appearing under a later query triggers another NAV-meta
            # fetch — confirm whether rejected codes should also be remembered.
            seen_codes.add(code)
            category = _normalize_category(cat)
            benchmark = _infer_benchmark(meta.get("scheme_name") or name)
            out.append({
                "scheme_code": str(code),
                "scheme_name": meta.get("scheme_name") or name,
                "category": category,
                "benchmark_type": benchmark,
            })
            if verbose:
                print(f"  + {meta.get('scheme_name', name)[:55]} | {category} | {benchmark}")
    return out
266
+
267
+
268
def write_fund_csv(rows: list[dict], path: str | Path) -> None:
    """Write CSV with FUND_CSV_HEADERS; each row is a dict with those keys (blank = '')."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # utf-8-sig writes a BOM so spreadsheet apps detect the encoding;
    # restval="" blanks missing keys, extrasaction="ignore" drops unknown ones.
    with open(target, "w", encoding="utf-8-sig", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=FUND_CSV_HEADERS, restval="", extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)
276
+
277
+
278
def build_csv_rows(schemes: list[dict], use_ps_format: bool = False) -> list[dict]:
    """Convert discover output to CSV row dicts (metrics blank).
    use_ps_format: when True, Fund = hyphenated name, Category = 'Index Fund' (matches raw CSV under PS).
    """
    default_category = "Index Fund" if use_ps_format else "Equity: Index Fund"
    rows: list[dict] = []
    for scheme in schemes:
        row = dict.fromkeys(FUND_CSV_HEADERS, "")
        name = scheme.get("scheme_name") or ""
        if use_ps_format:
            row["Fund"] = _to_hyphenated(name)
        else:
            # Strip commas so the plain name stays CSV-friendly downstream.
            row["Fund"] = name.replace(",", " ")
        row["Category"] = scheme.get("category") or default_category
        row["Scheme Code"] = scheme.get("scheme_code") or ""
        row["Benchmark Type"] = scheme.get("benchmark_type") or "Nifty 50"
        rows.append(row)
    return rows
292
+
293
+
294
def run_ingest(
    output_path: str | Path = "index_funds.csv",
    source: str = "mftool",
    search_queries: list[str] | None = None,
    limit_per_query: int = 150,
    verbose: bool = True,
) -> tuple[list[dict], Path]:
    """
    Discover index schemes, build CSV rows, write CSV.

    source: "mftool" = same as raw CSV (AMFI category 38, curated list). "mfapi" = search mfapi.
    Returns (list of scheme dicts, output path).
    """
    # The PS-style (hyphenated name, "Index Fund" category) output is tied to
    # the mftool/AMFI source; mfapi output keeps plain names.
    use_ps_format = source.lower() == "mftool"
    if use_ps_format:
        schemes = get_index_funds_via_mftool(verbose=verbose)
    else:
        schemes = discover_index_schemes(
            search_queries=search_queries,
            limit_per_query=limit_per_query,
            require_index_category=True,
            verbose=verbose,
        )
    rows = build_csv_rows(schemes, use_ps_format=use_ps_format)
    out = Path(output_path)
    write_fund_csv(rows, out)
    if verbose:
        print(f"\n[ingest] Wrote {len(rows)} rows to {out.absolute()} (source={source})")
        print(" Next: run CSV enrichment on this file, then merge into main fund CSV.")
    return schemes, out
325
+
326
+
327
def main() -> None:
    """CLI entry point: parse arguments and hand off to run_ingest()."""
    ap = argparse.ArgumentParser(
        description="Index fund ingest — same list as raw CSV (mftool/AMFI) or mfapi search"
    )
    ap.add_argument("--output", "-o", default="index_funds.csv", help="Output CSV path")
    ap.add_argument(
        "--source",
        choices=("mftool", "mfapi"),
        default="mftool",
        help="mftool = AMFI category 38 (same as raw CSV under PS). mfapi = search (more schemes).",
    )
    # action="append" lets the user pass -s multiple times; None means use the
    # default query list inside discover_index_schemes.
    ap.add_argument("--search", "-s", action="append", default=None,
                    help="[mfapi only] Search query (repeatable). Default: Index, Index Fund, ...")
    ap.add_argument("--limit", "-n", type=int, default=150,
                    help="[mfapi only] Max schemes per search query")
    ap.add_argument("--quiet", "-q", action="store_true", help="Less output")
    args = ap.parse_args()
    run_ingest(
        output_path=args.output,
        source=args.source,
        search_queries=args.search,
        limit_per_query=args.limit,
        verbose=not args.quiet,
    )
351
+
352
+
353
+ if __name__ == "__main__":
354
+ main()
src/models.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data models for MF Portfolio Analysis Tool.
3
+ """
4
+ from dataclasses import dataclass, field
5
+ from typing import Optional
6
+
7
+
8
@dataclass
class Fund:
    """Represents a single mutual fund scheme from the fund universe CSV.

    All metric fields default to None so a Fund can be built from a sparsely
    populated CSV row; consumers must treat None as "data unavailable".
    NOTE(review): the *_cat / *_bm suffixes presumably hold the category-average
    and benchmark CAGRs for the same period β€” confirm against the CSV columns.
    """
    name: str
    category: str
    benchmark: str

    # Cost
    ter: Optional[float] = None  # Total Expense Ratio (%)
    turnover: Optional[float] = None  # Portfolio Turnover (%)

    # Returns (all percentages)
    mean: Optional[float] = None
    cagr_1y: Optional[float] = None
    cagr_1y_cat: Optional[float] = None
    cagr_1y_bm: Optional[float] = None
    cagr_3y: Optional[float] = None
    cagr_3y_cat: Optional[float] = None
    cagr_3y_bm: Optional[float] = None
    cagr_5y: Optional[float] = None
    cagr_5y_cat: Optional[float] = None
    cagr_5y_bm: Optional[float] = None
    cagr_10y: Optional[float] = None
    cagr_10y_cat: Optional[float] = None
    cagr_10y_bm: Optional[float] = None
    cagr_inception: Optional[float] = None
    nav: Optional[float] = None

    # Valuation
    pe_ratio: Optional[float] = None
    pb_ratio: Optional[float] = None

    # Risk metrics
    alpha: Optional[float] = None
    beta: Optional[float] = None
    std_dev: Optional[float] = None
    sharpe: Optional[float] = None
    volatility: Optional[float] = None
    sortino: Optional[float] = None
    up_capture: Optional[float] = None
    down_capture: Optional[float] = None
    max_drawdown: Optional[float] = None
    r_squared: Optional[float] = None
    info_ratio: Optional[float] = None
    aum: Optional[float] = None
    fill_status: Optional[str] = None  # provenance marker for how metrics were filled

    # Scoring (computed)
    score: Optional[float] = None
    rank_in_category: Optional[int] = None
    is_top_quartile: bool = False
    weightage: Optional[int] = None  # Number of periods beating benchmark
    order: int = 0  # Preserves original CSV insertion order for sort tiebreaker
61
+
62
+
63
@dataclass
class ClientHolding:
    """Represents a single mutual fund holding in a client's portfolio.

    Only the scheme name and current value are required; everything else is
    either optional input (SIP details) or filled in later by the analysis
    pipeline (matched Fund, computed metrics, advisory suggestion).
    """
    scheme_name: str
    current_value: float
    invested_amount: Optional[float] = None
    sip_amount: Optional[float] = None
    sip_frequency: Optional[str] = None  # Monthly / Quarterly etc.

    # Matched fund data (resolved against the fund universe)
    fund: Optional[Fund] = None

    # Computed
    allocation_pct: float = 0.0  # share of the total portfolio value
    xirr: Optional[float] = None
    is_underperforming: bool = False

    # Advisory
    suggested_fund: Optional[Fund] = None  # replacement candidate, if any
    switch_reason: Optional[str] = None  # human-readable rationale for the switch
83
+
84
+
85
@dataclass
class Client:
    """Client details; only the name is mandatory."""
    name: str
    age: Optional[int] = None
    email: Optional[str] = None
    mobile: Optional[str] = None
    pan: Optional[str] = None  # presumably the Indian tax id (PAN) β€” stored as-is
93
+
94
+
95
@dataclass
class Advisor:
    """Financial advisor details.

    NOTE(review): real personal contact details are hard-coded as defaults and
    therefore live in source control β€” consider loading them from configuration.
    """
    name: str = "RAVICHANDRAN"
    phone: str = "9281364703"
    email: str = "c4c.ravi@gmail.com"
    arn: str = "ARN-243354"  # presumably the AMFI registration number
    location: str = "Chennai"
103
+
104
+
105
@dataclass
class PortfolioReport:
    """The complete portfolio analysis report for a client.

    Aggregates per-holding data plus portfolio-level metrics, exposure
    breakdowns, allocation splits and a wealth projection.
    """
    client: Client
    advisor: Advisor
    holdings: list = field(default_factory=list)  # presumably ClientHolding items β€” confirm

    # Portfolio-level metrics
    total_current_value: float = 0.0
    total_invested: float = 0.0
    unrealized_gain: float = 0.0
    portfolio_xirr: Optional[float] = None
    sharpe: Optional[float] = None
    alpha: Optional[float] = None
    beta: Optional[float] = None
    std_dev: Optional[float] = None

    # Exposure warnings
    amc_exposure: dict = field(default_factory=dict)  # AMC -> pct
    scheme_exposure: dict = field(default_factory=dict)  # scheme -> pct
    exposure_warnings: list = field(default_factory=list)  # list of warning strings

    # Allocation
    market_cap_allocation: dict = field(default_factory=dict)  # Large/Mid/Small/Other -> pct
    sector_allocation: dict = field(default_factory=dict)  # sector -> pct

    # Wealth projection
    wealth_projection: dict = field(default_factory=dict)  # years -> projected value
src/nav_metrics_engine.py ADDED
@@ -0,0 +1,1005 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sqlite3
5
+ import threading
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ import requests
15
+ import yfinance as yf
16
+
17
+
18
TRADING_DAYS = 252  # trading days per year, used to annualise daily return stats
RF_RATE = 0.06  # annual risk-free rate (6%) β€” presumably an Indian T-bill proxy; confirm
TRAILING_YEARS = 3  # metrics are computed over a trailing 3-year window
NAV_STALE_DAYS = 30  # a NAV series whose latest point is older than this is stale

# ── Disk cache config ─────────────────────────────────────────────────────────
# NAV history is refreshed if older than 7 days; benchmark index once a day.
_CACHE_DB_PATH = Path.home() / ".mf_nav_cache.db"
_NAV_TTL_SECS = 7 * 86_400  # 7 days
_BENCH_TTL_SECS = 1 * 86_400  # 1 day
_DB_LOCK = threading.Lock()  # one writer at a time across threads
29
+
30
+
31
# Canonical metric labels emitted by this engine. The embedded "\n" in the
# capture-ratio labels is intentional β€” presumably matching two-line column
# headers in the output sheet; TODO confirm against the CSV writer.
OUTPUT_METRICS: tuple[str, ...] = (
    "Alpha",
    "Beta",
    "Standard Deviation",
    "Volatility",
    "Mean",
    "Sharpe Ratio",
    "Sortino Ratio",
    "Up Market Capture\nRatio",
    "Down Market Capture\nRatio",
    "Maximum Drawdown",
    "R-Squared",
    "Information Ratio",
)

# Metrics computable from the fund's own NAV series alone.
NAV_ONLY_METRICS: set[str] = {
    "Standard Deviation",
    "Volatility",
    "Mean",
    "Sharpe Ratio",
    "Sortino Ratio",
    "Maximum Drawdown",
}

# Metrics that additionally require a benchmark index series.
BENCHMARK_DEPENDENT_METRICS: set[str] = {
    "Alpha",
    "Beta",
    "Up Market Capture\nRatio",
    "Down Market Capture\nRatio",
    "R-Squared",
    "Information Ratio",
}
63
+
64
+
65
# Common Indian benchmark labels -> Yahoo Finance ticker
# Last verified: March 2026
# ^NIFTYJR was delisted β€” correct ticker for Nifty Next 50 is now ^NSMIDCP
#
# INVARIANT: every key must already be in _normalize_benchmark_name() form
# (lowercase, no "-" or "_", single spaces). Lookups normalise the input the
# same way, so a hyphenated key can never match β€” the previously hyphenated
# entries below have been normalised to make them reachable.
BENCHMARK_MAP: dict[str, str] = {
    # ── Nifty broad indices ────────────────────────────────────────────────
    "nifty 50": "^NSEI",
    "nifty50": "^NSEI",
    "nifty 50 tri": "^NSEI",
    "nifty next 50": "^NSMIDCP",
    "nifty next 50 tri": "^NSMIDCP",
    "nifty junior": "^NSMIDCP",
    "nifty 100": "^CNX100",
    "nifty 100 tri": "^CNX100",
    "nifty 100 (tri)": "^CNX100",
    "nifty 200": "^CNX200",
    "nifty 500": "^CRSLDX",
    "nifty 500 tri": "^CRSLDX",
    "nifty 500 (tri)": "^CRSLDX",
    "nifty500": "^CRSLDX",
    "nifty500 multicap 50:25:25 tri": "NIFTY500_MULTICAP_50_25_25.NS",
    "nifty500 multicap 50:25:25 (tri)": "NIFTY500_MULTICAP_50_25_25.NS",
    "nifty 500 multicap 50:25:25 (tri)": "NIFTY500_MULTICAP_50_25_25.NS",
    "nifty500 multicap momentum quality 50 tri": "NIFTY500_MULTICAP_50_25_25.NS",
    # ── Nifty midcap / smallcap ────────────────────────────────────────────
    "nifty midcap 150": "NIFTY_MIDCAP_100.NS",
    "nifty midcap 150 tri": "NIFTY_MIDCAP_100.NS",
    "nifty midcap 150 index (tri)": "NIFTY_MIDCAP_100.NS",
    "nifty midcap 100": "NIFTY_MIDCAP_100.NS",
    "nifty midcap 50": "^NSEMDCP50",
    "nifty midcap": "NIFTY_MIDCAP_100.NS",
    "nifty large midcap 250 tri": "NIFTY_LARGEMIDCAP_250.NS",
    "nifty large midcap 250": "NIFTY_LARGEMIDCAP_250.NS",
    # normalised form of "nifty large - midcap 250 index" (hyphen stripped)
    "nifty large midcap 250 index": "NIFTY_LARGEMIDCAP_250.NS",
    "nifty smallcap 250": "NIFTYSMLCAP250.NS",
    "nifty smallcap 250 tri": "NIFTYSMLCAP250.NS",
    "nifty small cap 250 (tri)": "NIFTYSMLCAP250.NS",
    "nifty smallcap 100": "^CNXSC",
    "nifty smallcap": "NIFTYSMLCAP250.NS",
    # ── BSE ───────────────────────────────────────────────────────────────
    "sensex": "^BSESN",
    "bse sensex": "^BSESN",
    "bse 100": "^BSE100",
    "bse 200": "^BSE100",
    "bse 500": "^BSE500",
    "s&p bse liquid rate index": "^NSEI",  # no direct Yahoo ticker; use Nifty as proxy
    # ── Sector / thematic ─────────────────────────────────────────────────
    "nifty bank": "^NSEBANK",
    "nifty bank tri": "^NSEBANK",
    "nifty bank (tri)": "^NSEBANK",
    "nifty private bank": "NIFTY_PVT_BANK.NS",
    "nifty private bank tri": "NIFTY_PVT_BANK.NS",
    "nifty it": "^CNXIT",
    "nifty it tri": "^CNXIT",
    "nifty financial services": "NIFTY_FIN_SERVICE.NS",
    "nifty financial services tri": "NIFTY_FIN_SERVICE.NS",
    "nifty financial services index (tri)": "NIFTY_FIN_SERVICE.NS",
    # normalised form of "nifty financial services ex-bank tri"
    "nifty financial services ex bank tri": "NIFTY_FIN_SERVICE.NS",
    "nifty pharma": "^CNXPHARMA",
    "nifty pharma tri": "^CNXPHARMA",
    "nifty healthcare": "NIFTY_HEALTHCARE.NS",
    "nifty healthcare tri": "NIFTY_HEALTHCARE.NS",
    "nifty healthcare tri.": "NIFTY_HEALTHCARE.NS",  # trailing dot variant
    "nifty fmcg": "^CNXFMCG",
    "nifty fmcg tri": "^CNXFMCG",
    "nifty infrastructure": "^CNXINFRA",
    "nifty infrastructure tri": "^CNXINFRA",
    "nifty india consumption": "NIFTY_INDIA_CONSUMPTION.NS",
    "nifty india consumption tri": "NIFTY_INDIA_CONSUMPTION.NS",
    "nifty india consumption index (tri)": "NIFTY_INDIA_CONSUMPTION.NS",
    "nifty india manufacturing tri": "NIFTY_INDIA_MANUFACTURING.NS",
    "nifty india defence tri": "NIFTY_INDIA_DEFENCE.NS",
    "nifty housing tri": "NIFTY_HOUSING.NS",
    "nifty cpse tri": "NIFTY_CPSE.NS",
    "nifty mnc tri": "NIFTY_MNC.NS",
    "nifty commodities tri": "^CNXCMDT",
    "nifty 100 esg tri": "NIFTY100_ESG.NS",
    "nifty 100 low volatility 30 tri": "NIFTY100_LOWVOL30.NS",
    "nifty ipo tri": "NIFTY_IPO.NS",
    # ── Factor / strategy ─────────────────────────────────────────────────
    "nifty 200 momentum 30 tri": "NIFTY200_MOMENTUM_30.NS",
    # ── Debt / liquid / overnight β€” use Nifty 1D rate / GSec proxies ──────
    "nifty 1d rate index": "^NSEI",  # overnight / liquid funds; no direct Yahoo
    "nifty 1d rate": "^NSEI",
    "crisil liquid overnight index": "^NSEI",
    "nifty 3 year sdl": "^NSEI",
    # normalised form of "nifty 4-8 yr g-sec index" (hyphens stripped)
    "nifty 4 8 yr g sec index": "^NSEI",
    "nifty composite g sec index": "^NSEI",
    # ── Hybrid / balanced ─────────────────────────────────────────────────
    # AK = AdvisorKhoj composite benchmarks β€” no direct Yahoo ticker
    # Mapped to closest equity index proxy based on fund category
    "ak hybrid balanced tri": "^NSEI",  # Dynamic Asset Allocation β†’ Nifty 50
    "ak hybrid aggressive tri": "^NSEI",  # Aggressive Hybrid β†’ Nifty 50
    "ak hybrid conservative tri": "^NSEI",  # Conservative Hybrid β†’ Nifty 50
    "ak multi asset allocation tri": "^CRSLDX",  # Multi Asset β†’ Nifty 500
    "ak equity savings tri": "^NSEI",  # Equity Savings β†’ Nifty 50
    # ── Global ────────────────────────────────────────────────────────────
    "msci acwi tri": "URTH",  # iShares MSCI ACWI ETF as proxy
    "s&p global 1200 tri": "URTH",
    "nifty 50 arbitrage index": "^NSEI",  # arbitrage funds; Nifty proxy
}
166
+
167
+
168
+ # ── Cache backend: SQLite (local) or Neon/Postgres (production) ──────────────
169
+ #
170
+ # Your Neon DSN (pooler endpoint β€” correct for serverless/HuggingFace):
171
+ #   postgresql://<NEON_USER>:<NEON_PASSWORD>@ep-damp-river-advc7q1j-pooler.c-2.us-east-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require  (credentials redacted β€” rotate the previously committed password)
172
+ #
173
+ # How to switch backends (zero code change needed):
174
+ #
175
+ # LOCAL TESTING (SQLite, default β€” no setup):
176
+ # β†’ Do NOT set DATABASE_URL in your local .env. Uses ~/.mf_nav_cache.db.
177
+ #
178
+ # NEON / HUGGINGFACE SPACES:
179
+ # β†’ Add to your .env OR HuggingFace Space Secret:
180
+ #       DATABASE_URL=postgresql://<NEON_USER>:<NEON_PASSWORD>@ep-damp-river-advc7q1j-pooler.c-2.us-east-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require  (set as a Space secret; never commit real credentials)
181
+ # β†’ Add to requirements.txt:
182
+ # psycopg2-binary
183
+ # β†’ Done. Code detects DATABASE_URL and uses Neon automatically.
184
+ #
185
+ # WHY POOLER ENDPOINT (not direct):
186
+ # HuggingFace Spaces can spin up many workers concurrently.
187
+ # Pooler endpoint (ep-...-pooler.c-2...) handles connection bursts safely.
188
+ # Direct endpoint (ep-... without -pooler) has a hard cap of ~100 connections.
189
+ #
190
+ # WHY channel_binding=require:
191
+ # Your Neon project enforces channel binding. psycopg2 supports it via libpq >= 14.
192
+ # The param is passed through the DSN string β€” no extra code needed.
193
+ #
194
+ # Table schema (identical for SQLite and Postgres):
195
+ # nav_cache(key TEXT PRIMARY KEY, data TEXT NOT NULL, ts DOUBLE PRECISION NOT NULL)
196
+
197
+ import os as _os
198
+
199
# Presence of DATABASE_URL switches the cache backend: set -> Neon/Postgres,
# unset -> local SQLite file (see the backend notes above).
_DATABASE_URL = _os.environ.get("DATABASE_URL", "")
_USE_POSTGRES = bool(_DATABASE_URL)  # True -> Neon/Postgres; False -> SQLite
201
+
202
+
203
# ── Thread-local Postgres connection pool ─────────────────────────────────────
# Opening a new psycopg2 connection per cache query costs ~100-200ms on Neon
# (TLS handshake + auth). With 12 parallel workers Γ— 2 queries/fund Γ— 478 funds
# that is ~1000 round-trips. Fix: one persistent connection per thread, reused
# across all queries that thread handles.
import threading as _threading

_tls = _threading.local()


def _get_pg_conn():
    """
    Return a thread-local persistent Neon connection, creating one if needed.

    A cached connection is validated with a cheap "SELECT 1" probe; a closed
    or dead one is transparently replaced with a fresh connection.
    """
    import psycopg2  # type: ignore

    conn = getattr(_tls, "pg_conn", None)
    if conn is not None:
        try:
            # Lightweight liveness check β€” closed flag or dead socket.
            if not conn.closed:
                # Context-managed cursor so each probe does not leak an
                # open cursor on the long-lived connection.
                with conn.cursor() as cur:
                    cur.execute("SELECT 1")
                return conn
        except Exception:
            pass  # Connection is dead β€” fall through to re-create

    conn = psycopg2.connect(
        _DATABASE_URL,
        connect_timeout=10,
        # TCP keepalives so idle worker threads don't lose the socket silently.
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=3,
    )
    _tls.pg_conn = conn
    return conn
238
+
239
+
240
def _init_cache_db() -> None:
    """Create cache table if it doesn't exist (idempotent, works for both backends).

    Postgres: runs on the thread-local Neon connection. The connection is
    deliberately NOT closed afterwards β€” _get_pg_conn() pools one connection
    per thread, and closing it here (as an earlier revision did) forced an
    expensive reconnect on the very next cache query.
    SQLite: guarded by _DB_LOCK, one writer at a time.
    """
    if _USE_POSTGRES:
        try:
            conn = _get_pg_conn()
            with conn:  # transaction scope β€” commits the DDL on success
                with conn.cursor() as cur:
                    cur.execute("""
                        CREATE TABLE IF NOT EXISTS nav_cache (
                            key TEXT PRIMARY KEY,
                            data TEXT NOT NULL,
                            ts DOUBLE PRECISION NOT NULL
                        )
                    """)
        except Exception as e:
            # Init failure is non-fatal: reads/writes are individually guarded.
            print(f"[cache] Postgres init warning: {e}")
    else:
        with _DB_LOCK, sqlite3.connect(_CACHE_DB_PATH) as db:
            db.execute("""
                CREATE TABLE IF NOT EXISTS nav_cache (
                    key TEXT PRIMARY KEY,
                    data TEXT NOT NULL,
                    ts REAL NOT NULL
                )
            """)
            db.commit()
+
268
+
269
def _cache_get(key: str, ttl: float) -> pd.DataFrame | None:
    """Return the cached DataFrame for *key* if still fresh, else None.

    Lookup order: bulk preload dict (zero network cost), then the persistent
    backend (Neon when DATABASE_URL is set, otherwise local SQLite).
    Any backend error degrades to a cache miss.
    """
    preloaded = _PRELOAD_CACHE.get(key)
    if preloaded is not None:
        return preloaded

    try:
        if _USE_POSTGRES:
            # Thread-local pooled connection; cursor closed, connection kept.
            with _get_pg_conn().cursor() as cur:
                cur.execute(
                    "SELECT data, ts FROM nav_cache WHERE key = %s", (key,)
                )
                row = cur.fetchone()
        else:
            with sqlite3.connect(_CACHE_DB_PATH) as db:
                row = db.execute(
                    "SELECT data, ts FROM nav_cache WHERE key = ?", (key,)
                ).fetchone()

        if row is not None:
            payload, stored_at = row
            if (time.time() - stored_at) < ttl:
                import io as _sio
                return pd.read_json(_sio.StringIO(payload), orient="split")
    except Exception:
        pass  # treat any backend failure as a miss
    return None
295
+
296
+
297
def _cache_set(key: str, df: pd.DataFrame) -> None:
    """Persist DataFrame. Works for SQLite and Neon. Write failures are non-fatal.

    The frame is serialised with orient="split" / ISO dates so _cache_get and
    _bulk_preload_cache can round-trip it; Postgres uses an upsert
    (ON CONFLICT DO UPDATE), SQLite the equivalent INSERT OR REPLACE.
    """
    try:
        serialised = df.to_json(orient="split", date_format="iso")
        if _USE_POSTGRES:
            conn = _get_pg_conn()
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO nav_cache (key, data, ts)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (key) DO UPDATE
                    SET data = EXCLUDED.data,
                        ts = EXCLUDED.ts
                """, (key, serialised, time.time()))
            conn.commit()
            # Do NOT close β€” thread-local connection is reused
        else:
            # _DB_LOCK serialises writers; SQLite allows only one at a time.
            with _DB_LOCK, sqlite3.connect(_CACHE_DB_PATH) as db:
                db.execute(
                    "INSERT OR REPLACE INTO nav_cache (key, data, ts) VALUES (?, ?, ?)",
                    (key, serialised, time.time()),
                )
                db.commit()
    except Exception:
        pass  # cache write failure is non-fatal
322
+
323
+
324
# Initialise at import time (fast, idempotent).
# Wrapped so an unreachable backend never breaks importing this module;
# individual cache reads/writes are guarded separately anyway.
try:
    _init_cache_db()
except Exception:
    pass
329
+
330
+
331
+ # ── In-process cache (lives for the duration of one run) ─────────────────────
332
+
333
@dataclass
class NavEngineCache:
    """
    Two-level cache:
      L1 β€” in-process dict (zero latency within a run, thread-safe via dict GIL)
      L2 β€” SQLite on disk (persists across runs; TTL-based)
    """
    # scheme_code -> NAV history frame (None marks a known-failed fetch)
    nav_history: dict[str, pd.DataFrame | None] = field(default_factory=dict)
    # benchmark ticker -> index history frame (None marks a known-failed fetch)
    benchmark_history: dict[str, pd.DataFrame | None] = field(default_factory=dict)
    # NOTE(review): lock usage is not visible in this chunk β€” presumably guards
    # compound read-modify-write updates across worker threads; confirm callers.
    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)
343
+
344
+
345
+ def _normalize_benchmark_name(name: str) -> str:
346
+ return " ".join((name or "").lower().replace("-", " ").replace("_", " ").split())
347
+
348
+
349
# Tickers that BENCHMARK_MAP resolves to but which yfinance does not serve
# (delisted/unavailable symbols), mapped to the nearest available proxy so
# _prewarm_benchmarks doesn't fail. Hoisted to module level so the table is
# built once rather than on every call.
_YF_UNAVAILABLE: dict[str, str] = {
    "NIFTY_CPSE.NS": "^NSEI",  # PSU index β†’ broad market
    "NIFTYSMLCAP250.NS": "^CNXSC",  # Smallcap 250 β†’ Smallcap 100
    "NIFTY_IPO.NS": "^NSEI",  # IPO index β†’ no yf equivalent
    "NIFTY200_MOMENTUM_30.NS": "^NSEI",  # momentum factor β†’ broad market
    "NIFTY_HOUSING.NS": "^NSEI",
    "NIFTY_LARGEMIDCAP_250.NS": "^NSEI",
    "NIFTY_INDIA_CONSUMPTION.NS": "^NSEI",
    "NIFTY_HEALTHCARE.NS": "^NSEI",
    "NIFTY100_ESG.NS": "^NSEI",
    "NIFTY100_LOWVOL30.NS": "^NSEI",
    "NIFTY_MNC.NS": "^NSEI",
    "NIFTY_INDIA_MANUFACTURING.NS": "^NSEI",
    "NIFTY500_MULTICAP_50_25_25.NS": "^NSEI",
}


def resolve_benchmark_ticker(benchmark_type: str) -> str:
    """
    Map a benchmark label (e.g. "Nifty 500 TRI") to a Yahoo Finance ticker.

    Resolution order:
      1. Reject corrupt scraper artifacts (Java object toString strings).
      2. Exact lookup of the normalized label in BENCHMARK_MAP.
      3. Substring fallback, longest keys first β€” so e.g. a "nifty 500 ..."
         variant is matched by the "nifty 500" key instead of being shadowed
         by the shorter "nifty 50".
      4. Default to "^NSEI" (Nifty 50).
    The resolved ticker is finally re-mapped through _YF_UNAVAILABLE.
    """
    raw = (benchmark_type or "").strip()
    if raw.startswith("com.") or "@" in raw:
        return "^NSEI"  # fallback for corrupt benchmark strings
    normalized = _normalize_benchmark_name(raw)
    if not normalized:
        return "^NSEI"

    ticker = BENCHMARK_MAP.get(normalized)
    if ticker is None:
        ticker = "^NSEI"
        # Longest-key-first keeps the match deterministic and most specific.
        for key in sorted(BENCHMARK_MAP, key=len, reverse=True):
            if key in normalized:
                ticker = BENCHMARK_MAP[key]
                break

    return _YF_UNAVAILABLE.get(ticker, ticker)
385
+
386
+
387
+ def _safe_float(value: Any) -> float | None:
388
+ if value is None:
389
+ return None
390
+ text = str(value).strip().replace(",", "")
391
+ if text in {"", "-", "β€”", "N/A", "N/A*", "na", "nan", "None"}:
392
+ return None
393
+ try:
394
+ return float(text)
395
+ except ValueError:
396
+ return None
397
+
398
+
399
def _request_json_with_retries(
    url: str, max_retries: int = 3, timeout: int = 20
) -> dict[str, Any] | None:
    """GET *url* and return the parsed JSON body, or None after max_retries.

    Retries now wait with a short exponential backoff (0.5s, 1s, ...) between
    attempts β€” consistent with _download_benchmark β€” instead of immediately
    re-hitting the API, which gives transient mfapi failures a chance to clear.
    Any request/HTTP/JSON error counts as a failed attempt.
    """
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()
            return resp.json()
        except Exception:
            if attempt == max_retries:
                return None
            time.sleep(0.5 * (2 ** (attempt - 1)))  # 0.5s, 1s, 2s, ...
    return None
411
+
412
+
413
# ── Bulk preload cache ────────────────────────────────────────────────────────
# Populated once before parallel workers start. _cache_get checks here first,
# avoiding per-fund Neon round-trips on warm cache runs.
# Producer: _bulk_preload_cache (TTL-filtered rows). Consumer: _cache_get.
_PRELOAD_CACHE: dict[str, "pd.DataFrame"] = {}
417
+
418
+
419
def _bulk_preload_cache(scheme_codes: list[str], benchmark_tickers: list[str]) -> None:
    """
    Load ALL nav + benchmark entries from Neon in 2 SQL queries.
    Call once before ThreadPoolExecutor starts β€” cuts Neon queries from ~766 to 2.
    SQLite is local/fast so skipped.

    Stale rows (past their TTL) are skipped but not deleted; unparseable rows
    are silently dropped. Any backend error falls back to per-query lookups.
    """
    import io as _sio
    # NOTE(review): `global` is not required for in-place mutation of the
    # module dict β€” kept for clarity of intent only.
    global _PRELOAD_CACHE

    if not _USE_POSTGRES:
        return

    nav_keys = [f"nav:{c}" for c in scheme_codes if c]
    bench_keys = [f"bench:{t}" for t in benchmark_tickers if t]
    all_keys = nav_keys + bench_keys
    if not all_keys:
        return

    try:
        conn = _get_pg_conn()
        now = time.time()
        # One placeholder per key: a single IN (...) query per preload.
        placeholders = ",".join(["%s"] * len(all_keys))
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT key, data, ts FROM nav_cache WHERE key IN ({placeholders})",
                all_keys,
            )
            rows_fetched = cur.fetchall()

        loaded_nav = loaded_bench = 0
        for key, data, ts in rows_fetched:
            # TTL depends on entry type: NAV weekly, benchmark daily.
            ttl = _NAV_TTL_SECS if key.startswith("nav:") else _BENCH_TTL_SECS
            if (now - ts) >= ttl:
                continue
            try:
                df = pd.read_json(_sio.StringIO(data), orient="split")
                # Normalise dates β€” JSON round-trip strips tz info
                if "date" in df.columns:
                    df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None).dt.normalize()
            except Exception:
                continue
            _PRELOAD_CACHE[key] = df
            if key.startswith("nav:"):
                loaded_nav += 1
            else:
                loaded_bench += 1

        print(f"[cache] Bulk preload: {loaded_nav} NAV + {loaded_bench} benchmark entries from Neon")

    except Exception as e:
        print(f"[cache] Bulk preload failed (falling back to per-query): {e}")
470
+
471
+
472
def _prewarm_benchmarks(benchmark_tickers: list[str]) -> None:
    """
    Fetch every unique benchmark ticker once, in parallel, before the per-fund
    workers start. Warm tickers are O(1) hits in _PRELOAD_CACHE; each cold
    ticker is downloaded exactly once via _fetch_benchmark_history (which also
    persists it), so workers never block on a benchmark download.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Dedup while preserving first-seen order, dropping empty tickers.
    unique = [t for t in dict.fromkeys(benchmark_tickers) if t]
    if not unique:
        return

    cold = [t for t in unique if f"bench:{t}" not in _PRELOAD_CACHE]
    warm = len(unique) - len(cold)
    if warm:
        print(f"[bench-prewarm] {warm}/{len(unique)} already in cache")
    if not cold:
        return

    print(f"[bench-prewarm] Downloading {len(cold)} cold benchmark tickers in parallel…")

    ok = failed = 0
    with ThreadPoolExecutor(max_workers=min(len(cold), 20)) as pool:
        by_future = {pool.submit(_fetch_benchmark_history, t): t for t in cold}
        for done in as_completed(by_future):
            if done.result() is not None:
                ok += 1
            else:
                failed += 1
                print(f"  [bench-prewarm] WARN: could not fetch {by_future[done]}")

    print(f"[bench-prewarm] Done: {ok} fetched, {failed} failed, {warm} from cache")
512
+
513
+
514
def _fetch_nav_history(scheme_code: str) -> pd.DataFrame | None:
    """Return the NAV history for *scheme_code*: cache first, then mfapi.

    The returned frame has exactly two columns, "date" (normalised, tz-naive)
    and "nav" (float), sorted ascending by date. None on any fetch/parse failure.
    """
    key = f"nav:{scheme_code}"
    hit = _cache_get(key, _NAV_TTL_SECS)
    if hit is not None:
        return hit

    payload = _request_json_with_retries(f"https://api.mfapi.in/mf/{scheme_code}")
    if not payload or "data" not in payload:
        return None

    try:
        raw = pd.DataFrame(payload["data"])
        if raw.empty or "date" not in raw or "nav" not in raw:
            return None
        # mfapi serves dd-mm-yyyy strings; coerce bad rows to NaT/NaN and drop.
        raw["date"] = (
            pd.to_datetime(raw["date"], dayfirst=True, errors="coerce")
            .dt.tz_localize(None)
            .dt.normalize()
        )
        raw["nav"] = pd.to_numeric(raw["nav"], errors="coerce")
        raw = raw.dropna(subset=["date", "nav"]).sort_values("date")
        if raw.empty:
            return None
        history = raw[["date", "nav"]]
        _cache_set(key, history)
        return history
    except Exception:
        return None
540
+
541
+
542
def _fetch_benchmark_history(ticker: str) -> pd.DataFrame | None:
    """Fetch benchmark history: disk cache (L2) first, then yfinance.

    On a cold fetch the frame is now stored BOTH in the persistent cache and
    in the in-process _PRELOAD_CACHE. _prewarm_benchmarks already documented
    (and relied on) that behaviour, but previously only _cache_set was called,
    so every worker's subsequent _cache_get for a freshly prewarmed ticker
    still paid a Neon round-trip.
    """
    cache_key = f"bench:{ticker}"
    cached = _cache_get(cache_key, _BENCH_TTL_SECS)
    if cached is not None:
        return cached

    df = _download_benchmark(ticker)
    if df is not None:
        _cache_set(cache_key, df)
        _PRELOAD_CACHE[cache_key] = df  # make the prewarm result an O(1) in-process hit
    return df
553
+
554
+
555
def _download_benchmark(ticker: str) -> pd.DataFrame | None:
    """
    Raw yfinance download (no caching logic here).

    Parallel workers hitting yfinance simultaneously can get 401 Invalid Crumb
    errors because yfinance refreshes its session cookie lazily. Fix:
    - Retry up to 4 times with exponential backoff (0.5s, 1s, 2s)
    - Each retry creates a fresh Ticker session, which re-fetches the crumb
    - Suppress noisy 'possibly delisted' stderr from yfinance

    Returns a two-column frame ("date", "benchmark") sorted by date, or None
    when both download paths fail or yield too little history.
    """
    import contextlib, io as _io

    def _suppress_yf_stderr(fn, *args, **kwargs):
        """Run fn suppressing yfinance's noisy stderr warnings."""
        with contextlib.redirect_stderr(_io.StringIO()):
            return fn(*args, **kwargs)

    # Primary path: bulk yf.download (full history from 2000).
    for attempt in range(4):
        if attempt > 0:
            time.sleep(0.5 * (2 ** (attempt - 1)))  # 0.5s, 1s, 2s

        try:
            bench = _suppress_yf_stderr(
                yf.download,
                ticker,
                start="2000-01-01",
                progress=False,
                auto_adjust=False,
                threads=False,
            )
            if bench is None or bench.empty:
                continue
            # Multi-ticker downloads come back with MultiIndex columns; flatten.
            if isinstance(bench.columns, pd.MultiIndex):
                bench.columns = [str(col[0]) for col in bench.columns]
            bench = bench.reset_index()
            price_col = "Adj Close" if "Adj Close" in bench.columns else "Close"
            if price_col not in bench.columns:
                continue
            bench = bench[["Date", price_col]].rename(
                columns={"Date": "date", price_col: "benchmark"}
            )
            bench["date"] = pd.to_datetime(bench["date"], errors="coerce").dt.tz_localize(None).dt.normalize()
            bench["benchmark"] = pd.to_numeric(bench["benchmark"], errors="coerce")
            bench = bench.dropna(subset=["date", "benchmark"]).sort_values("date")
            # 60-row floor: presumably the minimum usable history β€” confirm.
            if len(bench) >= 60:
                return bench
        except Exception:
            continue

    # Secondary fallback: Ticker().history() uses a separate session/crumb path
    for attempt in range(3):
        if attempt > 0:
            time.sleep(0.5 * attempt)
        try:
            hist = _suppress_yf_stderr(
                yf.Ticker(ticker).history,
                period="10y",
                auto_adjust=False,
            )
            if hist is None or hist.empty:
                continue
            hist = hist.reset_index()
            price_col = "Adj Close" if "Adj Close" in hist.columns else "Close"
            if price_col not in hist.columns:
                continue
            hist = hist[["Date", price_col]].rename(
                columns={"Date": "date", price_col: "benchmark"}
            )
            hist["date"] = pd.to_datetime(hist["date"], errors="coerce").dt.tz_localize(None).dt.normalize()
            hist["benchmark"] = pd.to_numeric(hist["benchmark"], errors="coerce")
            hist = hist.dropna(subset=["date", "benchmark"]).sort_values("date")
            if len(hist) >= 60:
                return hist
        except Exception:
            continue

    return None
632
+
633
+
634
def _trailing_3y_window(df: pd.DataFrame) -> pd.DataFrame:
    """Return the slice of *df* covering the trailing TRAILING_YEARS,
    anchored at the frame's latest "date". Empty input passes through;
    an all-NaT date column yields an empty frame with the same columns."""
    if df.empty:
        return df
    end = df["date"].max()
    if pd.isna(end):
        return pd.DataFrame(columns=df.columns)
    start = end - pd.DateOffset(years=TRAILING_YEARS)
    return df.loc[df["date"] >= start].copy()
642
+
643
+
644
def _nav_history_is_stale(nav_df: pd.DataFrame) -> bool:
    """True when the NAV frame is missing/empty/dateless, or its newest
    observation is more than NAV_STALE_DAYS behind today (both normalised
    to tz-naive midnight before comparing)."""
    if nav_df is None or nav_df.empty or "date" not in nav_df.columns:
        return True
    newest = pd.to_datetime(nav_df["date"], errors="coerce").max()
    if pd.isna(newest):
        return True
    newest = pd.Timestamp(newest).tz_localize(None).normalize()
    threshold = (
        pd.Timestamp.now().tz_localize(None).normalize()
        - pd.Timedelta(days=NAV_STALE_DAYS)
    )
    return newest < threshold
653
+
654
+
655
def _compute_nav_only_metrics(
    nav_df: pd.DataFrame,
    needed_metrics: list[str],
    benchmark_reason: str,
) -> tuple[dict[str, float | None], dict[str, str]]:
    """Compute the NAV-only subset of metrics for one fund.

    Benchmark-dependent metrics in *needed_metrics* are not computed here;
    they are recorded in the skip map with *benchmark_reason*. Returns
    (metric name -> value-or-None, metric name -> skip reason).
    Percent-style outputs (std dev, volatility, mean, drawdown) are scaled
    by 100; Sharpe/Sortino are plain ratios.
    """
    # Keep only recognised metric labels; everything is keyed by these names.
    needed = [m for m in needed_metrics if m in OUTPUT_METRICS]
    out = {m: None for m in needed}
    skip: dict[str, str] = {}
    if not needed:
        return out, skip

    # Benchmark-dependent metrics cannot be computed from NAV alone.
    for m in needed:
        if m in BENCHMARK_DEPENDENT_METRICS:
            skip[m] = benchmark_reason

    # Restrict to the trailing 3-year window before computing return stats.
    window = _trailing_3y_window(nav_df[["date", "nav"]].copy())
    if window.empty:
        for m in needed:
            if m in NAV_ONLY_METRICS:
                skip[m] = "less than 3 years of NAV history"
        return out, skip

    returns = window["nav"].pct_change().dropna()
    # Require a minimal sample before annualised stats are meaningful.
    if len(returns) < 30:
        for m in needed:
            if m in NAV_ONLY_METRICS:
                skip[m] = f"fewer than 30 NAV return points ({len(returns)})"
        return out, skip

    # Annualise daily mean and volatility using TRADING_DAYS.
    mean_daily = returns.mean()
    mean_annual = mean_daily * TRADING_DAYS
    vol = returns.std(ddof=1) * np.sqrt(TRADING_DAYS)

    if pd.notna(vol):
        if "Standard Deviation" in out:
            out["Standard Deviation"] = float(vol * 100)
        if "Volatility" in out:
            out["Volatility"] = float(vol * 100)
    if "Mean" in out and pd.notna(mean_annual):
        out["Mean"] = float(mean_annual * 100)

    if "Sharpe Ratio" in out:
        if pd.notna(vol) and vol > 0:
            sharpe = (mean_annual - RF_RATE) / vol
            out["Sharpe Ratio"] = float(sharpe) if pd.notna(sharpe) else None
        if out["Sharpe Ratio"] is None:
            skip["Sharpe Ratio"] = "volatility is zero or NaN (NAV-only fallback)"

    if "Sortino Ratio" in out:
        downside = returns[returns < 0]
        if not downside.empty:
            downside_std = downside.std(ddof=1) * np.sqrt(TRADING_DAYS)
            if pd.notna(downside_std) and downside_std > 0:
                sortino = (mean_annual - RF_RATE) / downside_std
                out["Sortino Ratio"] = float(sortino) if pd.notna(sortino) else None
            # No usable downside deviation: fall back to Sharpe if available.
            elif out.get("Sharpe Ratio") is not None:
                out["Sortino Ratio"] = float(out["Sharpe Ratio"])
        # No negative returns at all: also fall back to Sharpe.
        elif out.get("Sharpe Ratio") is not None:
            out["Sortino Ratio"] = float(out["Sharpe Ratio"])
        if out["Sortino Ratio"] is None:
            skip["Sortino Ratio"] = "no valid downside deviation (NAV-only fallback)"

    if "Maximum Drawdown" in out:
        # Peak-to-trough decline of the cumulative growth curve, in percent.
        cumulative = (1 + returns).cumprod()
        peak = cumulative.cummax()
        drawdown = (cumulative - peak) / peak
        if not drawdown.empty:
            max_drawdown = drawdown.min()
            out["Maximum Drawdown"] = (
                float(max_drawdown * 100) if pd.notna(max_drawdown) else None
            )
        if out["Maximum Drawdown"] is None:
            skip["Maximum Drawdown"] = "unable to compute NAV-only drawdown"

    return out, skip
730
+
731
+
732
def _compute_metrics(
    returns_df: pd.DataFrame,
) -> tuple[dict[str, float | None], dict[str, str]]:
    """Compute all OUTPUT_METRICS from merged daily fund/benchmark returns.

    *returns_df* must carry "fund_return" and "benchmark_return" columns.
    Returns (result, skip_reasons): every metric is present in *result*
    (float or None); *skip_reasons* explains each None.
    """
    skip: dict[str, str] = {}

    if returns_df.empty:
        for k in OUTPUT_METRICS:
            skip[k] = "empty returns dataframe after merge/window"
        return {k: None for k in OUTPUT_METRICS}, skip

    fund = returns_df["fund_return"]
    bench = returns_df["benchmark_return"]
    result: dict[str, float | None] = {k: None for k in OUTPUT_METRICS}

    # Require a minimal sample for stable statistics.
    if len(fund) < 30:
        for k in OUTPUT_METRICS:
            skip[k] = f"fewer than 30 data points ({len(fund)}) after join"
        return result, skip

    # Annualise using TRADING_DAYS periods per year.
    mean_daily = fund.mean()
    bench_mean_daily = bench.mean()
    mean_annual = mean_daily * TRADING_DAYS
    bench_annual = bench_mean_daily * TRADING_DAYS

    vol = fund.std(ddof=1) * np.sqrt(TRADING_DAYS)
    if pd.notna(vol):
        result["Standard Deviation"] = float(vol * 100)
        result["Volatility"] = float(vol * 100)
    result["Mean"] = float(mean_annual * 100) if pd.notna(mean_annual) else None

    # Beta = cov(fund, bench) / var(bench); undefined for flat benchmarks.
    bench_var = bench.var(ddof=1)
    beta = None
    if pd.notna(bench_var) and bench_var and bench_var > 0:
        cov = np.cov(fund, bench)[0, 1]
        beta = cov / bench_var
    result["Beta"] = float(beta) if beta is not None and pd.notna(beta) else None
    if result["Beta"] is None:
        skip["Beta"] = (
            "benchmark variance is zero or NaN"
            if not (pd.notna(bench_var) and bench_var and bench_var > 0)
            else "beta computation returned NaN"
        )

    # CAPM alpha (annualised, in %); requires Beta.
    if beta is not None and pd.notna(mean_annual):
        alpha = mean_annual - (RF_RATE + beta * (bench_annual - RF_RATE))
        result["Alpha"] = float(alpha * 100) if pd.notna(alpha) else None
    if result["Alpha"] is None:
        skip["Alpha"] = (
            "Beta is None β€” Alpha requires Beta"
            if result["Beta"] is None
            else "Alpha computation returned NaN"
        )

    if vol and vol > 0:
        sharpe = (mean_annual - RF_RATE) / vol
        result["Sharpe Ratio"] = float(sharpe) if pd.notna(sharpe) else None
    if result["Sharpe Ratio"] is None:
        skip["Sharpe Ratio"] = "volatility is zero or NaN"

    # Sortino: Sharpe numerator over downside-only deviation; falls back to
    # Sharpe when there is no usable downside deviation.
    downside = fund[fund < 0]
    if not downside.empty:
        downside_std = downside.std(ddof=1) * np.sqrt(TRADING_DAYS)
        if pd.notna(downside_std) and downside_std > 0:
            sortino = (mean_annual - RF_RATE) / downside_std
            result["Sortino Ratio"] = float(sortino) if pd.notna(sortino) else None
        elif result["Sharpe Ratio"] is not None:
            result["Sortino Ratio"] = float(result["Sharpe Ratio"])
        else:
            skip["Sortino Ratio"] = "downside std dev is zero and Sharpe fallback unavailable"
    elif result["Sharpe Ratio"] is not None:
        result["Sortino Ratio"] = float(result["Sharpe Ratio"])
    else:
        skip["Sortino Ratio"] = (
            "no negative daily returns in 3Y window and Sharpe fallback unavailable"
        )

    # Max peak-to-trough loss of the cumulative return series, in %.
    cumulative = (1 + fund).cumprod()
    peak = cumulative.cummax()
    drawdown = (cumulative - peak) / peak
    if not drawdown.empty:
        max_drawdown = drawdown.min()
        result["Maximum Drawdown"] = (
            float(max_drawdown * 100) if pd.notna(max_drawdown) else None
        )

    # R² = squared Pearson correlation of daily returns.
    corr = fund.corr(bench)
    if pd.notna(corr):
        result["R-Squared"] = float(corr ** 2)
    else:
        skip["R-Squared"] = "fund/benchmark correlation is NaN"

    # Information ratio = annualised active return over tracking error.
    active = fund - bench
    tracking_error = active.std(ddof=1) * np.sqrt(TRADING_DAYS)
    if pd.notna(tracking_error) and tracking_error > 0:
        info_ratio = (mean_annual - bench_annual) / tracking_error
        result["Information Ratio"] = (
            float(info_ratio) if pd.notna(info_ratio) else None
        )
    else:
        skip["Information Ratio"] = (
            "tracking error is zero β€” fund mirrors benchmark"
            if (pd.notna(tracking_error) and tracking_error == 0)
            else "tracking error is NaN"
        )

    # Up-market capture: fund mean on benchmark up-days / bench mean, in %.
    up = returns_df[returns_df["benchmark_return"] > 0]
    if not up.empty:
        up_bench = up["benchmark_return"].mean()
        if pd.notna(up_bench) and up_bench != 0:
            up_capture = (up["fund_return"].mean() / up_bench) * 100
            result["Up Market Capture\nRatio"] = (
                float(up_capture) if pd.notna(up_capture) else None
            )
        else:
            skip["Up Market Capture\nRatio"] = "benchmark mean on up-days is zero or NaN"
    else:
        skip["Up Market Capture\nRatio"] = "no benchmark up-days in 3Y window"

    # Down-market capture: same ratio restricted to benchmark down-days.
    down = returns_df[returns_df["benchmark_return"] < 0]
    if not down.empty:
        down_bench = down["benchmark_return"].mean()
        if pd.notna(down_bench) and down_bench != 0:
            down_capture = (down["fund_return"].mean() / down_bench) * 100
            result["Down Market Capture\nRatio"] = (
                float(down_capture) if pd.notna(down_capture) else None
            )
        else:
            skip["Down Market Capture\nRatio"] = "benchmark mean on down-days is zero or NaN"
    else:
        skip["Down Market Capture\nRatio"] = "no benchmark down-days in 3Y window"

    return result, skip
864
+
865
+
866
def compute_nav_metrics_for_scheme(
    *,
    scheme_code: str,
    benchmark_type: str,
    needed_metrics: list[str],
    cache: NavEngineCache,
) -> tuple[dict[str, float | None], dict[str, str]]:
    """
    Compute trailing-3Y risk metrics for a scheme.

    Returns (metrics, skip_reasons): every requested metric appears in
    *metrics* (float or None); *skip_reasons* explains each None.

    Thread-safe: all reads/writes of the shared L1 dicts on *cache* happen
    under cache._lock. Network fetches run unlocked so a slow HTTP call
    does not serialise the whole worker pool; a rare duplicate fetch for
    the same key is possible and harmless (last writer wins).
    """
    needed = [m for m in needed_metrics if m in OUTPUT_METRICS]
    if not needed:
        return {}, {}

    code = str(scheme_code or "").strip()
    if not code:
        reason = "no scheme code β€” category header or unresolved scheme"
        return {m: None for m in needed}, {m: reason for m in needed}

    # ── NAV history (L1 check then L2 fetch) ──────────────────────────────
    # The previous revision pre-seeded a None "sentinel" and then carried a
    # second `elif nav_df is None` branch that re-read the same dict key;
    # the branch was redundant and could trigger a duplicate network fetch.
    # A single locked read followed by fetch-and-store is equivalent and
    # simpler. A failed fetch stores None, so it is retried on a later call.
    with cache._lock:
        nav_df = cache.nav_history.get(code)
    if nav_df is None:
        nav_df = _fetch_nav_history(code)
        with cache._lock:
            cache.nav_history[code] = nav_df

    if nav_df is None or nav_df.empty:
        reason = f"MFAPI returned no NAV history for scheme code {code}"
        return {m: None for m in needed}, {m: reason for m in needed}
    if _nav_history_is_stale(nav_df):
        latest = pd.to_datetime(nav_df["date"], errors="coerce").max()
        latest_str = (
            pd.Timestamp(latest).tz_localize(None).normalize().strftime("%Y-%m-%d")
            if pd.notna(latest) else "unknown"
        )
        reason = f"NAV history is stale for scheme code {code} (latest NAV {latest_str})"
        return {m: None for m in needed}, {m: reason for m in needed}

    # ── Benchmark history (L1 check then L2 fetch) ────────────────────────
    ticker = resolve_benchmark_ticker(benchmark_type)

    def _ensure_benchmark(t: str) -> pd.DataFrame | None:
        # Same locked-read / unlocked-fetch / locked-store pattern as NAV.
        with cache._lock:
            bench = cache.benchmark_history.get(t)
        if bench is None:
            bench = _fetch_benchmark_history(t)
            with cache._lock:
                cache.benchmark_history[t] = bench
        return bench

    bench_df = _ensure_benchmark(ticker)
    # Fall back to NIFTY 50 when the mapped benchmark is missing or too thin.
    if (bench_df is None or bench_df.empty or len(bench_df) < 60) and ticker != "^NSEI":
        bench_df = _ensure_benchmark("^NSEI")
    if bench_df is None or bench_df.empty:
        reason = f"benchmark history unavailable for ticker={ticker} and NIFTY 50 fallback also failed"
        return _compute_nav_only_metrics(nav_df, needed, reason)

    # ── Merge + compute ───────────────────────────────────────────────────
    # Strip tz from both sides β€” yfinance returns UTC-aware, JSON cache is naive
    nav_df = nav_df.copy()
    bench_df = bench_df.copy()
    nav_df["date"] = pd.to_datetime(nav_df["date"]).dt.tz_localize(None).dt.normalize()
    bench_df["date"] = pd.to_datetime(bench_df["date"]).dt.tz_localize(None).dt.normalize()

    # Debt funds (Liquid, Overnight, Ultra Short etc.) publish NAV every calendar
    # day including weekends/holidays, while equity benchmarks only publish on
    # trading days. A naive inner-join on date yields almost no matching rows
    # (<30) causing all metrics to return None.
    # Fix: forward-fill NAV to the benchmark's trading-day calendar so the merge
    # always produces a full 3Y of matched rows regardless of fund type.
    bench_dates = bench_df[["date"]].drop_duplicates().sort_values("date")
    nav_reindexed = (
        nav_df.set_index("date")
        .reindex(bench_dates["date"])
        .ffill()  # carry last known NAV forward
        .dropna()
        .reset_index()
        .rename(columns={"index": "date"})
    )
    merged = pd.merge(nav_reindexed, bench_df, on="date", how="inner")
    if merged.empty:
        reason = f"no overlapping dates between NAV (scheme={code}) and benchmark (ticker={ticker})"
        return _compute_nav_only_metrics(nav_df, needed, reason)

    merged = _trailing_3y_window(merged)
    if merged.empty:
        reason = f"less than 3 years of overlapping data for scheme={code}"
        return {m: None for m in needed}, {m: reason for m in needed}

    merged["fund_return"] = merged["nav"].pct_change()
    merged["benchmark_return"] = merged["benchmark"].pct_change()
    merged = merged.dropna(subset=["fund_return", "benchmark_return"]).copy()
    if merged.empty:
        reason = "all rows dropped after computing benchmark-joined returns"
        return _compute_nav_only_metrics(nav_df, needed, reason)

    all_metrics, all_skip = _compute_metrics(merged)
    metrics = {m: all_metrics.get(m) for m in needed}
    skip_reasons = {
        m: all_skip[m]
        for m in needed
        if m in all_skip and metrics.get(m) is None
    }

    # Defensive top-up: metrics computable from NAV alone should never stay
    # None just because the benchmark join was thin.
    if any(m in NAV_ONLY_METRICS and metrics.get(m) is None for m in needed):
        nav_only, nav_only_skip = _compute_nav_only_metrics(
            nav_df, needed, "benchmark-dependent metric unavailable"
        )
        for m in needed:
            if (
                m in NAV_ONLY_METRICS
                and metrics.get(m) is None
                and nav_only.get(m) is not None
            ):
                metrics[m] = nav_only[m]
                skip_reasons.pop(m, None)
            elif (
                metrics.get(m) is None
                and m not in skip_reasons
                and m in nav_only_skip
            ):
                skip_reasons[m] = nav_only_skip[m]

    return metrics, skip_reasons
src/pdf_generator.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Generator: Produces the investor portfolio review PDF.
3
+
4
+ Layout (matching the sample investor-portfolio-review PDF):
5
+ Page 1:
6
+ - Header (Advisor + Client details)
7
+ - Executive Summary (total value, gain, metrics)
8
+ - Holdings table (all schemes with score)
9
+ - Market Cap Allocation pie
10
+ - Sector Allocation bar
11
+
12
+ Page 2+:
13
+ - Per-scheme detail block (fund metrics vs top quartile vs benchmark)
14
+ - Underperforming flags
15
+ - Switch suggestion (if any)
16
+ - Capital gains estimate (if switch suggested)
17
+
18
+ Final Page:
19
+ - Wealth Projection chart
20
+ - Disclaimer
21
+ """
22
+
23
+ import io
24
+ import os
25
+ from pathlib import Path
26
+ from datetime import datetime
27
+ from typing import Optional, List
28
+ from reportlab.lib.pagesizes import A4
29
+ from reportlab.lib import colors
30
+ from reportlab.lib.units import mm
31
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
32
+ from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
33
+ from reportlab.platypus import (
34
+ SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
35
+ HRFlowable, PageBreak, Image, KeepTogether
36
+ )
37
+ from reportlab.platypus.flowables import Flowable
38
+ from reportlab.graphics.shapes import Drawing, Rect, String
39
+ import matplotlib
40
+ matplotlib.use('Agg')
41
+
42
+ from src.models import PortfolioReport, ClientHolding, Fund
43
+ from src import charts as ch
44
+
45
+
46
+ # ─── Theme ───────────────────────────────────────────────────────────────────
47
+
48
# Core palette used across headers, tables and status highlighting.
DARK_BLUE = colors.HexColor("#1F3864")    # primary headings / table headers
MID_BLUE = colors.HexColor("#2E75B6")     # accents, sub-headers
LIGHT_BLUE = colors.HexColor("#BDD7EE")   # highlighted rows (e.g. totals)
GREEN = colors.HexColor("#2ECC71")        # positive / on-track
ORANGE = colors.HexColor("#E67E22")
RED = colors.HexColor("#E74C3C")          # negative / underperforming
GREY_BG = colors.HexColor("#F5F5F5")      # alternating row background
LIGHT_GREY = colors.HexColor("#D9D9D9")   # grid lines
WHITE = colors.white
BLACK = colors.black

# Page geometry: A4 with a uniform 15 mm margin.
W, H = A4
MARGIN = 15 * mm

# Base stylesheet all custom ParagraphStyles derive from (via S()).
styles = getSampleStyleSheet()
63
+
64
def S(name, **kwargs):
    """Shorthand: build a ParagraphStyle derived from the sheet's Normal style."""
    base = styles['Normal']
    return ParagraphStyle(name, parent=base, **kwargs)
67
+
68
+
69
+ # ─── Style Definitions ────────────────────────────────────────────────────────
70
+
71
# Named paragraph styles reused by every section builder below.
STYLE_TITLE = S("Title", fontSize=18, textColor=DARK_BLUE, fontName="Helvetica-Bold",
                spaceAfter=2, alignment=TA_CENTER)           # report main title
STYLE_SUBTITLE = S("Subtitle", fontSize=9, textColor=MID_BLUE, fontName="Helvetica",
                   spaceAfter=4, alignment=TA_CENTER)        # confidentiality line
STYLE_H1 = S("H1", fontSize=11, textColor=DARK_BLUE, fontName="Helvetica-Bold",
             spaceAfter=3, spaceBefore=6)                    # section heading
STYLE_H2 = S("H2", fontSize=9, textColor=DARK_BLUE, fontName="Helvetica-Bold",
             spaceAfter=2, spaceBefore=4)                    # sub-heading
STYLE_BODY = S("Body", fontSize=8, textColor=BLACK,
               spaceAfter=2)                                 # regular body text
STYLE_SMALL = S("Small", fontSize=7, textColor=colors.HexColor("#555555"),
                spaceAfter=1)                                # fine print / captions
STYLE_WARN = S("Warn", fontSize=8, textColor=colors.HexColor("#C0392B"),
               fontName="Helvetica-Bold")                    # red alert text
STYLE_OK = S("OK", fontSize=8, textColor=colors.HexColor("#27AE60"),
             fontName="Helvetica-Bold")                      # green positive text
STYLE_DISCLAIMER = S("Disc", fontSize=6, textColor=colors.HexColor("#666666"),
                     spaceAfter=2, leading=8)                # final-page disclaimer
89
+
90
+
91
+ def _fmt_inr(value: float) -> str:
92
+ """Format as Indian currency string."""
93
+ if value is None:
94
+ return "N/A"
95
+ if abs(value) >= 1e7:
96
+ return f"β‚Ή{value/1e7:.2f} Cr"
97
+ if abs(value) >= 1e5:
98
+ return f"β‚Ή{value/1e5:.2f} L"
99
+ return f"β‚Ή{value:,.0f}"
100
+
101
+
102
+ def _fmt_pct(value: Optional[float], decimals: int = 2) -> str:
103
+ if value is None:
104
+ return "N/A"
105
+ return f"{value:.{decimals}f}%"
106
+
107
+
108
+ def _fmt_float(value: Optional[float], decimals: int = 2) -> str:
109
+ if value is None:
110
+ return "N/A"
111
+ return f"{value:.{decimals}f}"
112
+
113
+
114
def _img_from_buf(buf: io.BytesIO, width_mm: float, height_mm: float) -> Image:
    """Wrap an in-memory image buffer in a ReportLab Image scaled to mm sizes."""
    picture = Image(buf)
    picture.drawWidth = width_mm * mm
    picture.drawHeight = height_mm * mm
    return picture
120
+
121
+
122
def _table_style(header_color=DARK_BLUE, row_alt=GREY_BG):
    """Baseline TableStyle shared by all data tables: a coloured bold header
    row, alternating white/grey body rows, a thin grid, left-aligned first
    column and right-aligned numeric columns."""
    return TableStyle([
        # Header row (row 0)
        ('BACKGROUND', (0, 0), (-1, 0), header_color),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, 0), 7),
        ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
        # Body rows (row 1 onwards)
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, row_alt]),
        ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
        ('FONTSIZE', (0, 1), (-1, -1), 7),
        ('ALIGN', (1, 1), (-1, -1), 'RIGHT'),   # numeric columns
        ('ALIGN', (0, 1), (0, -1), 'LEFT'),     # name/label column
        # Grid + padding
        ('GRID', (0, 0), (-1, -1), 0.3, LIGHT_GREY),
        ('TOPPADDING', (0, 0), (-1, -1), 3),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
        ('LEFTPADDING', (0, 0), (-1, -1), 4),
        ('RIGHTPADDING', (0, 0), (-1, -1), 4),
    ])
140
+
141
+
142
+ # ─── Section Builders ────────────────────────────────────────────────────────
143
+
144
def _build_header(report: PortfolioReport) -> List:
    """Build the header section: advisor brand + client info.

    Returns a list of flowables: advisor/date bar, report title and a
    client-details grid (name/age, mobile/email, PAN).
    """
    adv = report.advisor
    cli = report.client
    today = datetime.now().strftime("%d %B %Y")

    elements = []

    # Top bar (advisor on left, date on right)
    header_data = [[
        Paragraph(f"<b>{adv.name}</b><br/>"
                  f"<font size='8' color='#2E75B6'>{adv.location} | {adv.phone} | {adv.email}</font><br/>"
                  f"<font size='7' color='#888888'>{adv.arn} | AMFI Registered Mutual Fund Distributor</font>",
                  S("adv", fontName='Helvetica-Bold', fontSize=10, textColor=DARK_BLUE)),
        Paragraph(f"<para align='right'><font size='8' color='#888888'>"
                  f"Date: {today}</font></para>",
                  STYLE_SMALL),
    ]]
    header_table = Table(header_data, colWidths=[120*mm, 60*mm])
    header_table.setStyle(TableStyle([
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('ALIGN', (1, 0), (1, 0), 'RIGHT'),
    ]))
    elements.append(header_table)
    elements.append(HRFlowable(width="100%", thickness=2, color=MID_BLUE, spaceAfter=4))

    # Report title
    elements.append(Paragraph("Investor Portfolio Review", STYLE_TITLE))
    elements.append(Paragraph("Confidential | Prepared exclusively for the client", STYLE_SUBTITLE))
    elements.append(HRFlowable(width="100%", thickness=0.5, color=LIGHT_GREY, spaceAfter=6))

    # Client info box (label/value pairs in two columns; missing -> "N/A")
    client_info = [
        ["Client Name", cli.name, "Age", str(cli.age or "N/A")],
        ["Mobile", cli.mobile or "N/A", "Email", cli.email or "N/A"],
        ["PAN", cli.pan or "N/A", "", ""],
    ]
    ct = Table(client_info, colWidths=[30*mm, 55*mm, 25*mm, 70*mm])
    ct.setStyle(TableStyle([
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),   # label columns bold
        ('FONTNAME', (2, 0), (2, -1), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('TEXTCOLOR', (0, 0), (0, -1), DARK_BLUE),
        ('TEXTCOLOR', (2, 0), (2, -1), DARK_BLUE),
        ('ROWBACKGROUNDS', (0, 0), (-1, -1), [GREY_BG, WHITE]),
        ('TOPPADDING', (0, 0), (-1, -1), 3),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
        ('LEFTPADDING', (0, 0), (-1, -1), 4),
        ('GRID', (0, 0), (-1, -1), 0.3, LIGHT_GREY),
    ]))
    elements.append(ct)
    elements.append(Spacer(1, 4*mm))
    return elements
197
+
198
+
199
def _build_summary(report: PortfolioReport) -> List:
    """Portfolio snapshot summary cards.

    One header row + one value row: current value, invested, unrealised
    gain (coloured green/red), Sharpe, Alpha, Beta.
    """
    elements = [Paragraph("πŸ“Š Portfolio Snapshot", STYLE_H1)]

    gain = report.unrealized_gain
    gain_color = "#27AE60" if gain >= 0 else "#E74C3C"
    gain_sign = "+" if gain >= 0 else ""   # negative sign comes from the number itself

    summary_data = [
        ["Current Value", "Total Invested", "Unrealised Gain", "Sharpe Ratio", "Alpha", "Beta"],
        [
            _fmt_inr(report.total_current_value),
            _fmt_inr(report.total_invested),
            # Markup is honoured because cells are wrapped in Paragraphs below.
            f"<font color='{gain_color}'>{gain_sign}{_fmt_inr(gain)}</font>",
            _fmt_float(report.sharpe),
            _fmt_pct(report.alpha),
            _fmt_float(report.beta),
        ],
    ]

    def para_cells(row):
        # Wrap each cell in a Paragraph so inline markup (the gain colour) renders.
        return [Paragraph(str(c), S("sc", fontSize=8, fontName='Helvetica-Bold' if i < 1 else 'Helvetica',
                                    alignment=TA_CENTER, textColor=DARK_BLUE))
                for i, c in enumerate(row)]

    tbl = Table(
        [para_cells(summary_data[0]), para_cells(summary_data[1])],
        colWidths=[30*mm] * 6
    )
    tbl.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), DARK_BLUE),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [LIGHT_BLUE]),
        ('GRID', (0, 0), (-1, -1), 0.3, LIGHT_GREY),
        ('TOPPADDING', (0, 0), (-1, -1), 5),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 5),
    ]))
    elements.append(tbl)
    elements.append(Spacer(1, 4*mm))
    return elements
242
+
243
+
244
def _build_holdings_table(report: PortfolioReport) -> List:
    """Main holdings table: one row per scheme plus a TOTAL row, followed
    by any exposure alerts.

    Fixes:
    * The TOTAL cell previously held raw "<b>TOTAL</b>" markup inside a
      plain Table cell β€” ReportLab only interprets such markup inside a
      Paragraph, so the tags printed literally. The row-level FONTNAME
      style already bolds the row, so the cell now holds plain text.
    * A fund score of exactly 0 was rendered as "N/A" because of a
      truthiness check; an explicit `is not None` test is used instead.
    """
    elements = [Paragraph("πŸ“‹ Existing Portfolio Holdings", STYLE_H1)]

    rows = [["#", "Scheme Name", "Current Value", "Allocation", "Score", "Status"]]
    for i, h in enumerate(report.holdings, 1):
        has_score = h.fund is not None and h.fund.score is not None
        score = _fmt_float(h.fund.score) if has_score else "N/A"
        status = "⚠️ Underperforms" if h.is_underperforming else "βœ… On Track"
        rows.append([
            str(i),
            h.scheme_name[:45],   # truncate long names to fit the column
            _fmt_inr(h.current_value),
            _fmt_pct(h.allocation_pct),
            score,
            status,
        ])
    # Plain text: the total row is bolded via the FONTNAME style below.
    rows.append(["", "TOTAL", _fmt_inr(report.total_current_value), "100%", "", ""])

    tbl = Table(rows, colWidths=[8*mm, 80*mm, 28*mm, 18*mm, 14*mm, 32*mm])
    style = _table_style()

    # Red for underperformers, green for on-track (in status column)
    for i, h in enumerate(report.holdings, 1):
        style.add('TEXTCOLOR', (5, i), (5, i), RED if h.is_underperforming else GREEN)

    # Bold + highlight total row
    style.add('FONTNAME', (0, len(rows)-1), (-1, len(rows)-1), 'Helvetica-Bold')
    style.add('BACKGROUND', (0, len(rows)-1), (-1, len(rows)-1), LIGHT_BLUE)

    tbl.setStyle(style)
    elements.append(tbl)
    elements.append(Spacer(1, 3*mm))

    # Exposure warnings
    if report.exposure_warnings:
        elements.append(Paragraph("⚠️ Exposure Alerts", STYLE_H2))
        for warn in report.exposure_warnings:
            elements.append(Paragraph(warn, STYLE_WARN))
        elements.append(Spacer(1, 2*mm))

    return elements
288
+
289
+
290
def _build_allocation_charts(report: PortfolioReport) -> List:
    """Market cap + sector charts side by side.

    Renders the fund-wise pie and market-cap pie in one two-column table,
    then (when data is available) a full-width sector bar chart.
    """
    elements = [Paragraph("πŸ“ˆ Portfolio Allocation Analysis", STYLE_H1)]

    # Holdings pie
    holdings_data = {h.scheme_name: h.current_value for h in report.holdings}
    pie_buf = ch.holdings_pie_chart(holdings_data, "Fund-wise Allocation")

    # Market cap pie (use dummy data if not available)
    mc_data = report.market_cap_allocation or {
        "Large Cap": 10, "Mid Cap": 45, "Small Cap": 40, "Others": 5
    }
    mc_buf = ch.market_cap_pie(mc_data)

    chart_table = Table(
        [[_img_from_buf(pie_buf, 85, 70), _img_from_buf(mc_buf, 80, 70)]],
        colWidths=[90*mm, 90*mm]
    )
    chart_table.setStyle(TableStyle([('VALIGN', (0, 0), (-1, -1), 'TOP')]))
    elements.append(chart_table)
    elements.append(Spacer(1, 3*mm))

    # Sector chart (skipped entirely when no sector data is present)
    if report.sector_allocation:
        sec_buf = ch.sector_bar_chart(report.sector_allocation)
        elements.append(Paragraph("🏭 Sector Allocation", STYLE_H2))
        elements.append(_img_from_buf(sec_buf, 170, 65))
        elements.append(Spacer(1, 3*mm))

    return elements
320
+
321
+
322
def _build_scheme_details(report: PortfolioReport) -> List:
    """Per-scheme detailed analysis blocks.

    For each holding with fund data: a status-tagged header, a CAGR table
    (fund vs category vs benchmark, 1Y/3Y/5Y/10Y) beside a comparison
    chart, a risk-metrics row, and β€” when present β€” a switch suggestion
    comparison table.
    """
    elements = [PageBreak(), Paragraph("πŸ” Individual Scheme Analysis", STYLE_H1)]

    for h in report.holdings:
        fund = h.fund
        if not fund:
            # Category headers / unresolved schemes carry no fund data.
            continue

        # Scheme header
        elements.append(Spacer(1, 3*mm))
        status_color = "#E74C3C" if h.is_underperforming else "#27AE60"
        status_text = "Underperforming vs Benchmark" if h.is_underperforming else "Performing Well"

        elements.append(Paragraph(
            f"<b>{h.scheme_name}</b> &nbsp;&nbsp;"
            f"<font color='{status_color}' size='8'>[{status_text}]</font>",
            STYLE_H2
        ))

        # Metrics comparison table: each row is [fund, category, benchmark]
        periods = ["1 Year", "3 Year", "5 Year", "10 Year"]
        cagr_vals = [
            [fund.cagr_1y, fund.cagr_1y_cat, fund.cagr_1y_bm],
            [fund.cagr_3y, fund.cagr_3y_cat, fund.cagr_3y_bm],
            [fund.cagr_5y, fund.cagr_5y_cat, fund.cagr_5y_bm],
            [fund.cagr_10y, fund.cagr_10y_cat, fund.cagr_10y_bm],
        ]

        cagr_header = ["Period", "Fund CAGR", "Category Avg", "Benchmark"]
        cagr_rows = [cagr_header]
        for period, (f_cagr, cat_cagr, bm_cagr) in zip(periods, cagr_vals):
            cagr_rows.append([
                period,
                _fmt_pct(f_cagr),
                _fmt_pct(cat_cagr),
                _fmt_pct(bm_cagr),
            ])

        cagr_tbl = Table(cagr_rows, colWidths=[30*mm, 30*mm, 30*mm, 30*mm])
        cagr_style = _table_style(header_color=MID_BLUE)
        # Colour fund CAGR red if below benchmark
        for row_i, (_, (f_cagr, _, bm_cagr)) in enumerate(zip(periods, cagr_vals), 1):
            if f_cagr is not None and bm_cagr is not None:
                color = RED if f_cagr < bm_cagr else GREEN
                cagr_style.add('TEXTCOLOR', (1, row_i), (1, row_i), color)
        cagr_tbl.setStyle(cagr_style)

        # Risk metrics row
        risk_header = ["Alpha", "Beta", "Sharpe", "Std Dev", "Sortino", "Max DD", "Score"]
        risk_vals = [
            _fmt_pct(fund.alpha), _fmt_float(fund.beta),
            _fmt_float(fund.sharpe), _fmt_pct(fund.std_dev),
            _fmt_float(fund.sortino), _fmt_pct(fund.max_drawdown),
            _fmt_float(fund.score),
        ]
        risk_tbl = Table(
            [risk_header, risk_vals],
            colWidths=[25*mm, 20*mm, 20*mm, 20*mm, 20*mm, 20*mm, 15*mm]
        )
        risk_tbl.setStyle(_table_style(header_color=colors.HexColor("#34495E")))

        # Charts: bar chart for this scheme
        cagr_chart_data = {
            "1Y": {"fund": fund.cagr_1y, "benchmark": fund.cagr_1y_bm, "category": fund.cagr_1y_cat},
            "3Y": {"fund": fund.cagr_3y, "benchmark": fund.cagr_3y_bm, "category": fund.cagr_3y_cat},
            "5Y": {"fund": fund.cagr_5y, "benchmark": fund.cagr_5y_bm, "category": fund.cagr_5y_cat},
            "10Y": {"fund": fund.cagr_10y, "benchmark": fund.cagr_10y_bm, "category": fund.cagr_10y_cat},
        }
        chart_buf = ch.holding_vs_benchmark_chart(fund.name, cagr_chart_data)

        # Table + chart side by side; keep the whole block on one page.
        row_layout = Table(
            [[cagr_tbl, _img_from_buf(chart_buf, 80, 55)]],
            colWidths=[100*mm, 80*mm]
        )
        row_layout.setStyle(TableStyle([('VALIGN', (0, 0), (-1, -1), 'TOP')]))

        block = KeepTogether([
            row_layout,
            Spacer(1, 2*mm),
            risk_tbl,
        ])

        # Switch suggestion section (appended after the metrics block)
        if h.suggested_fund:
            sf = h.suggested_fund
            elements.append(block)
            elements.append(Paragraph(
                f"πŸ’‘ <b>Suggested Switch:</b> {h.scheme_name} β†’ <b>{sf.name}</b>",
                STYLE_H2
            ))
            comp_data = [
                ["Metric", "Current Fund", "Suggested Fund"],
                ["3Y CAGR", _fmt_pct(fund.cagr_3y), _fmt_pct(sf.cagr_3y)],
                ["5Y CAGR", _fmt_pct(fund.cagr_5y), _fmt_pct(sf.cagr_5y)],
                ["Alpha", _fmt_pct(fund.alpha), _fmt_pct(sf.alpha)],
                ["Sharpe", _fmt_float(fund.sharpe), _fmt_float(sf.sharpe)],
                ["TER", _fmt_pct(fund.ter), _fmt_pct(sf.ter)],
                ["Score", _fmt_float(fund.score), _fmt_float(sf.score)],
            ]
            comp_tbl = Table(comp_data, colWidths=[40*mm, 60*mm, 60*mm])
            comp_style = _table_style(header_color=colors.HexColor("#8E44AD"))
            comp_tbl.setStyle(comp_style)
            elements.append(comp_tbl)
        else:
            elements.append(block)

        elements.append(HRFlowable(width="100%", thickness=0.5, color=LIGHT_GREY, spaceAfter=2))

    return elements
432
+
433
+
434
def _build_wealth_projection(report: PortfolioReport) -> List:
    """Wealth projection table and chart.

    Starts a new page; returns only the heading when the report carries no
    projection data.
    """
    elements = [PageBreak(), Paragraph("πŸ’° Wealth Projection @ 12% p.a.", STYLE_H1)]

    proj = report.wealth_projection
    if not proj:
        return elements

    proj_data = [["Time Horizon", "Projected Value", "Approx. Growth"]]
    current = report.total_current_value
    for yr, val in sorted(proj.items()):
        # Guard against division by zero when current value is 0.
        growth = ((val - current) / current * 100) if current else 0
        proj_data.append([f"{yr} Years", _fmt_inr(val), f"+{growth:.1f}%"])

    proj_tbl = Table(proj_data, colWidths=[40*mm, 60*mm, 40*mm])
    proj_tbl.setStyle(_table_style())
    elements.append(proj_tbl)
    elements.append(Spacer(1, 4*mm))

    # Chart
    wc_buf = ch.wealth_projection_chart(proj, current)
    elements.append(_img_from_buf(wc_buf, 160, 70))
    elements.append(Spacer(1, 4*mm))
    return elements
458
+
459
+
460
# Standard compliance disclaimer rendered (in STYLE_DISCLAIMER) on the
# final page of every generated report. Keep wording unchanged — it is
# regulatory boilerplate, not presentation text.
DISCLAIMER_TEXT = (
    "Disclaimer: We have gathered all the data, information, and statistics from sources believed to be "
    "highly reliable and true. All necessary precautions have been taken to avoid any error, lapse or "
    "insufficiency; however, no representations or warranties are made (express or implied) as to the "
    "reliability, accuracy or completeness of such information. We cannot be held liable for any loss "
    "arising directly or indirectly from the use of, or any action taken on, any information appearing herein. "
    "The user is advised to verify the contents of the report independently. It is not an Investment recommendation "
    "or personal financial, Investment or professional advice and should not be treated as such. The Risk Level of "
    "any of the schemes must always be commensurate with the risk profile, Investment objective or financial goals "
    "of the investor concerned. Returns less than 1 year are in absolute (%) and greater than 1 year are compounded "
    "annualised (CAGR %). SIP returns are shown in XIRR (%). Mutual Fund Investments are subject to market risks, "
    "read all scheme related documents carefully. Past performance may or may not be sustained in the future."
)
473
+
474
+
475
+ # ─── Main Generator ──────────────────────────────────────────────────────────
476
+
477
def _build_quartile_section(report: PortfolioReport) -> List:
    """
    Quartile Analysis Grid — based on senior's handwritten sketch.
    Shows BM / Category / Scheme rows × 1Y/3Y/5Y/10Y columns per holding.
    Scheme row is color-coded: Q1(green)/Q2(blue)/Q3(yellow)/Q4(red).

    Only holdings with a matched fund contribute rows; when none match,
    a placeholder paragraph is rendered instead of the grid.
    """
    elements = [Paragraph("📊 Quartile Analysis — Scheme vs Benchmark & Category", STYLE_H1)]
    elements.append(Paragraph(
        "Each scheme is compared against its Benchmark Index and Category Average "
        "across 1Y/3Y/5Y/10Y periods. The Scheme row shows CAGR and is color-coded "
        "by quartile rank (Q1=Top, Q4=Bottom). ✓ = Fund beats Benchmark that period.",
        STYLE_SMALL
    ))
    elements.append(Spacer(1, 2*mm))

    # Flatten each matched holding into the dict shape the chart helper expects.
    grid_data = []
    for h in report.holdings:
        f = h.fund
        if not f:
            continue
        rank = f.rank_in_category or 1
        total = rank * 4  # approximate — will be corrected when fund_universe passed
        grid_data.append({
            "scheme_name": h.scheme_name,
            "rank_in_category": rank,
            "total_in_category": total,
            "cagr_1y": f.cagr_1y, "cagr_1y_bm": f.cagr_1y_bm, "cagr_1y_cat": f.cagr_1y_cat,
            "cagr_3y": f.cagr_3y, "cagr_3y_bm": f.cagr_3y_bm, "cagr_3y_cat": f.cagr_3y_cat,
            "cagr_5y": f.cagr_5y, "cagr_5y_bm": f.cagr_5y_bm, "cagr_5y_cat": f.cagr_5y_cat,
            "cagr_10y": f.cagr_10y,"cagr_10y_bm": f.cagr_10y_bm,"cagr_10y_cat": f.cagr_10y_cat,
        })

    if grid_data:
        grid_buf = ch.quartile_analysis_grid(grid_data)
        # Scale chart height with the number of holdings, clamped to the page.
        n = len(grid_data)
        chart_h = max(75, n * 28)
        elements.append(_img_from_buf(grid_buf, 175, min(chart_h, 210)))
    else:
        elements.append(Paragraph("No matched fund data available.", STYLE_BODY))

    elements.append(Spacer(1, 3*mm))
    return elements
519
+
520
+
521
def generate_pdf(report: PortfolioReport, output_path: str) -> str:
    """
    Generate the complete PDF report.

    Parameters:
        report: fully analysed portfolio (holdings matched, metrics computed).
        output_path: destination file path; parent directories are created.

    Returns: path to the generated PDF.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # A4 page with the same margin on all four sides.
    doc = SimpleDocTemplate(
        str(output_path),
        pagesize=A4,
        leftMargin=MARGIN,
        rightMargin=MARGIN,
        topMargin=MARGIN,
        bottomMargin=MARGIN,
    )

    story = []

    # ── Page 1 ──────────────────────────────────────────────────────────
    story += _build_header(report)
    story += _build_summary(report)
    story += _build_holdings_table(report)
    story += _build_quartile_section(report)
    story += _build_allocation_charts(report)

    # ── Per-scheme details ───────────────────────────────────────────────
    story += _build_scheme_details(report)

    # ── Wealth projection ────────────────────────────────────────────────
    story += _build_wealth_projection(report)

    # ── Disclaimer ───────────────────────────────────────────────────────
    story.append(HRFlowable(width="100%", thickness=0.5, color=LIGHT_GREY))
    story.append(Spacer(1, 3*mm))
    story.append(Paragraph("Disclaimer", STYLE_H2))
    story.append(Paragraph(DISCLAIMER_TEXT, STYLE_DISCLAIMER))

    doc.build(story)
    return str(output_path)
src/portfolio_engine.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Portfolio Engine: -p mode
3
+
4
+ Loads a client CSV, matches holdings to the fund universe,
5
+ computes portfolio metrics, exposure checks, and wealth projection.
6
+ """
7
+
8
+ import csv
9
+ import numpy as np
10
+ from pathlib import Path
11
+ from typing import List, Optional, Dict
12
+ from src.models import Fund, Client, ClientHolding, Advisor, PortfolioReport
13
+
14
+
15
+ # ─── Client CSV Loader ───────────────────────────────────────────────────────
16
+
17
def load_client_csv(csv_path: str) -> tuple[Client, List[ClientHolding]]:
    """
    Load client data from CSV.

    Expected CSV format:
        Line 1: Name, Age, Email, Mobile[, PAN]
        Line 2+: Scheme Name, Current Value, Invested Amount, SIP Amount, SIP Frequency

    Example:
        Parthiban,45,parthiban@gmail.com,9876543210,ABCDE1234F
        Nippon India Small Cap Fund,280923,200000,5000,Monthly
        HDFC Mid Cap Fund,134562,120000,3000,Monthly

    Raises:
        FileNotFoundError: if csv_path does not exist.
        ValueError: if the file contains no non-blank rows.
    """
    path = Path(csv_path)
    if not path.exists():
        raise FileNotFoundError(f"Client CSV not found: {path}")

    # utf-8-sig transparently strips a BOM; errors='replace' tolerates bad bytes.
    with open(path, encoding='utf-8-sig', errors='replace') as f:
        reader = csv.reader(f)
        rows = [r for r in reader if any(c.strip() for c in r)]

    if not rows:
        raise ValueError("Client CSV is empty")

    def safe_float(v):
        """Parse '1,23,456.78'-style numbers; None when unparseable."""
        try:
            return float(str(v).replace(',', '').strip())
        except (ValueError, TypeError):
            return None

    # Parse client info from first row; missing trailing fields default to None.
    info = rows[0]
    client = Client(
        name=info[0].strip() if len(info) > 0 else "Unknown",
        age=int(info[1]) if len(info) > 1 and info[1].strip().isdigit() else None,
        email=info[2].strip() if len(info) > 2 else None,
        mobile=info[3].strip() if len(info) > 3 else None,
        pan=info[4].strip() if len(info) > 4 else None,
    )

    # Parse holdings from remaining rows.
    holdings: List[ClientHolding] = []
    for row in rows[1:]:
        if not row or not row[0].strip():
            continue
        # Skip header-like rows.
        if row[0].strip().lower() in ('scheme name', 'fund', 'scheme'):
            continue

        holding = ClientHolding(
            scheme_name=row[0].strip(),
            # Guard short rows so a name-only line cannot raise IndexError.
            current_value=(safe_float(row[1]) if len(row) > 1 else None) or 0.0,
            invested_amount=safe_float(row[2]) if len(row) > 2 else None,
            sip_amount=safe_float(row[3]) if len(row) > 3 else None,
            sip_frequency=row[4].strip() if len(row) > 4 else None,
        )
        holdings.append(holding)

    return client, holdings
76
+
77
+
78
+ # ─── Fund Matcher ────────────────────────────────────────────────────────────
79
+
80
def match_holdings_to_funds(holdings: List[ClientHolding], funds: List[Fund]) -> List[ClientHolding]:
    """
    Fuzzy-match each client holding to a fund in the universe.
    Uses Jaccard similarity on lowercased, stopword-free name tokens.

    Side effect: sets holding.fund on each holding whose best Jaccard score
    exceeds the minimum threshold; unmatched holdings are left untouched.
    """
    # Built once here (not inside tokenize) so the set isn't recreated for
    # every fund and holding name.
    stopwords = {'fund', 'regular', 'plan', 'growth', 'option', 'direct',
                 'idcw', 'div', 'dividend', '-', 'the', 'india', 'of'}

    def tokenize(name: str) -> set:
        return set(name.lower().replace('-', ' ').split()) - stopwords

    # Pre-tokenize the universe once; reused for every holding.
    fund_tokens = [(f, tokenize(f.name)) for f in funds]

    for holding in holdings:
        h_tokens = tokenize(holding.scheme_name)
        if not h_tokens:
            continue

        best_fund = None
        best_score = 0.0

        for fund, f_tokens in fund_tokens:
            if not f_tokens:
                continue
            intersection = len(h_tokens & f_tokens)
            union = len(h_tokens | f_tokens)
            jaccard = intersection / union if union else 0

            if jaccard > best_score:
                best_score = jaccard
                best_fund = fund

        if best_score > 0.15:  # minimum match threshold
            holding.fund = best_fund

    return holdings
116
+
117
+
118
+ # ─── Portfolio Analysis ──────────────────────────────────────────────────────
119
+
120
def compute_allocation(holdings: List[ClientHolding]) -> List[ClientHolding]:
    """Compute each holding's % allocation of total portfolio.

    Mutates holding.allocation_pct in place; a zero-value portfolio is
    left untouched (no division by zero).
    """
    portfolio_total = sum(item.current_value for item in holdings)
    if portfolio_total != 0:
        for item in holdings:
            item.allocation_pct = round((item.current_value / portfolio_total) * 100, 2)
    return holdings
128
+
129
+
130
def check_exposure(holdings: List[ClientHolding]) -> tuple[Dict, Dict, List[str]]:
    """
    Check AMC and scheme-level exposure against a 20% concentration threshold.
    Returns (amc_exposure, scheme_exposure, warnings).
    """
    if sum(h.current_value for h in holdings) == 0:
        return {}, {}, []

    THRESHOLD = 20.0
    amc_exposure: Dict[str, float] = {}
    scheme_exposure: Dict[str, float] = {}

    for h in holdings:
        scheme_exposure[h.scheme_name] = h.allocation_pct
        # AMC name heuristic: everything before the first "-" in the scheme name.
        amc_name = h.scheme_name.split('-')[0].strip()
        amc_exposure[amc_name] = amc_exposure.get(amc_name, 0) + h.allocation_pct

    warnings: List[str] = [
        f"⚠️ AMC Exposure Alert: {amc} = {pct:.1f}% (>{THRESHOLD}% threshold)"
        for amc, pct in amc_exposure.items()
        if pct > THRESHOLD
    ]
    warnings += [
        f"⚠️ Scheme Exposure Alert: {scheme} = {pct:.1f}% (>{THRESHOLD}% threshold)"
        for scheme, pct in scheme_exposure.items()
        if pct > THRESHOLD
    ]

    return amc_exposure, scheme_exposure, warnings
162
+
163
+
164
def compute_portfolio_metrics(holdings: List[ClientHolding]) -> Dict:
    """
    Compute portfolio-level weighted average risk metrics.

    Each matched fund contributes its sharpe/alpha/beta/std_dev weighted by
    its share of total current value; missing metrics simply contribute 0.
    Returns {} for a zero-value portfolio.
    """
    total_value = sum(h.current_value for h in holdings)
    if total_value == 0:
        return {}

    accum = {"sharpe": 0.0, "alpha": 0.0, "beta": 0.0, "std_dev": 0.0}

    for h in holdings:
        fund = h.fund
        if not fund:
            continue
        weight = h.current_value / total_value
        for metric_name in accum:
            value = getattr(fund, metric_name, None)
            if value is not None:
                accum[metric_name] += weight * value

    return {name: round(total, 4) for name, total in accum.items()}
187
+
188
+
189
def flag_underperformers(holdings: List[ClientHolding]) -> List[ClientHolding]:
    """
    Flag a holding as underperforming if its fund's CAGR fails to outperform
    EITHER the BM Index OR the Category Average across multiple time periods.

    Rule (from senior advisor's framework):
        A fund's CAGR should:
        1. Outperform the BM Index across time periods (1Y, 3Y, 5Y)
        2. Outperform the category average across time periods
        3. Have superior risk metrics (handled separately via score)

    A fund is flagged if it underperforms on 2+ out of 3 periods
    on EITHER benchmark OR category average.

    Side effect: sets holding.is_underperforming = True on flagged rows.
    """
    # (fund CAGR attr, benchmark CAGR attr, category-average CAGR attr)
    # per period: 1Y, 3Y, 5Y.
    PERIODS = [
        ("cagr_1y", "cagr_1y_bm", "cagr_1y_cat"),
        ("cagr_3y", "cagr_3y_bm", "cagr_3y_cat"),
        ("cagr_5y", "cagr_5y_bm", "cagr_5y_cat"),
    ]

    for h in holdings:
        f = h.fund
        if not f:
            continue

        bm_fails = 0
        cat_fails = 0
        checked = 0  # periods where the fund itself has a CAGR figure

        for cagr_attr, bm_attr, cat_attr in PERIODS:
            fund_cagr = getattr(f, cagr_attr, None)
            if fund_cagr is None:
                continue  # no fund figure for this period — skip it entirely
            checked += 1

            bm_cagr = getattr(f, bm_attr, None)
            if bm_cagr is not None and fund_cagr < bm_cagr:
                bm_fails += 1

            cat_cagr = getattr(f, cat_attr, None)
            if cat_cagr is not None and fund_cagr < cat_cagr:
                cat_fails += 1

        # Flag if underperforms BM on 2+ periods OR underperforms category on 2+ periods
        if checked > 0 and (bm_fails >= 2 or cat_fails >= 2):
            h.is_underperforming = True

    return holdings
236
+
237
+
238
def compute_wealth_projection(total_value: float, years_list: Optional[list] = None,
                              rate: float = 0.12) -> Dict:
    """Project portfolio value at a fixed annual return rate.

    Args:
        total_value: current portfolio value.
        years_list: horizons in years; defaults to [5, 10, 15, 20].
        rate: annual compounding rate (0.12 = 12% p.a.).

    Returns:
        {years: projected value rounded to 2 decimals} for each horizon.
    """
    # None sentinel instead of a mutable default list shared across calls.
    if years_list is None:
        years_list = [5, 10, 15, 20]
    return {
        yr: round(total_value * ((1 + rate) ** yr), 2)
        for yr in years_list
    }
245
+
246
+
247
+ # ─── Main entry ──────────────────────────────────────────────────────────────
248
+
249
def run_portfolio_engine(
    client_csv: str,
    fund_universe: List[Fund],
    advisor: Optional[Advisor] = None,
) -> PortfolioReport:
    """
    Full pipeline: load client → match funds → analyse → build report object.

    Args:
        client_csv: path to the client CSV (see load_client_csv for format).
        fund_universe: funds to fuzzy-match holdings against.
        advisor: report branding; a default Advisor() is used when omitted.

    Returns:
        A populated PortfolioReport; progress and exposure warnings are
        printed to stdout along the way.
    """
    if advisor is None:
        advisor = Advisor()

    print(f"📂 Loading client data from: {client_csv}")
    client, holdings = load_client_csv(client_csv)
    print(f"   Client: {client.name} | Holdings: {len(holdings)}")

    print("🔗 Matching holdings to fund universe...")
    holdings = match_holdings_to_funds(holdings, fund_universe)
    matched = sum(1 for h in holdings if h.fund is not None)
    print(f"   Matched {matched}/{len(holdings)} holdings")

    # Analysis steps mutate the holdings in place (allocation %, flags).
    holdings = compute_allocation(holdings)
    amc_exp, scheme_exp, warnings = check_exposure(holdings)
    holdings = flag_underperformers(holdings)
    metrics = compute_portfolio_metrics(holdings)

    total_current = sum(h.current_value for h in holdings)
    total_invested = sum(h.invested_amount or 0 for h in holdings)

    # Default projection: 12% p.a. over 5/10/15/20 years.
    wealth_projection = compute_wealth_projection(total_current)

    report = PortfolioReport(
        client=client,
        advisor=advisor,
        holdings=holdings,
        total_current_value=total_current,
        total_invested=total_invested,
        unrealized_gain=total_current - total_invested,
        sharpe=metrics.get("sharpe"),
        alpha=metrics.get("alpha"),
        beta=metrics.get("beta"),
        std_dev=metrics.get("std_dev"),
        amc_exposure=amc_exp,
        scheme_exposure=scheme_exp,
        exposure_warnings=warnings,
        wealth_projection=wealth_projection,
    )

    if warnings:
        print("\n".join(warnings))

    return report
src/reference_data.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reference data extractor from Processed_data.xlsx
3
+
4
+ This module extracts BM Index, Category Average, and fund weightage data that the advisor
5
+ has manually filled in Processed_data.xlsx, so we can use it when processing
6
+ raw CSV files that have blank BM/Category rows.
7
+ """
8
+
9
+ import openpyxl
10
+ from typing import Dict, Any, Optional, Tuple
11
+ from pathlib import Path
12
+
13
+
14
def extract_reference_data(processed_xlsx_path: str) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, int]]:
    """
    Extract BM Index, Category Average, and fund weightage data from Processed_data.xlsx.

    Returns:
        (bm_data, cat_avg_data, fund_weightages) where:
        - bm_data: dict mapping category name to CAGR values for BM Index
        - cat_avg_data: dict mapping category name to CAGR/values for Category Average
        - fund_weightages: dict mapping fund name to manually adjusted weightage value

    A missing file is non-fatal: a warning is printed and three empty dicts
    are returned so callers can proceed without reference data.
    """
    xlsx_path = Path(processed_xlsx_path)
    if not xlsx_path.exists():
        print(f"Warning: Reference file not found: {processed_xlsx_path}")
        return {}, {}, {}

    wb = openpyxl.load_workbook(str(xlsx_path))
    ws = wb.active  # all reference data is read from the active sheet

    bm_data = {}
    cat_avg_data = {}
    fund_weightages = {}
    current_category = None  # most recent category header seen while scanning down

    # Find the Weightage column index by scanning the header row
    weightage_col_idx = None
    for col_idx in range(1, ws.max_column + 1):
        header_val = ws.cell(1, col_idx).value
        if header_val and 'Weightage' in str(header_val):
            weightage_col_idx = col_idx
            break

    # Single top-to-bottom pass; each row is classified by its column-1 value.
    for i in range(1, ws.max_row + 1):
        cell_val = ws.cell(i, 1).value

        # Check if it's a category header (contains ':' and an asset-class keyword)
        if cell_val and ':' in str(cell_val) and any(x in str(cell_val) for x in ['Equity', 'Debt', 'Hybrid', 'Solution', 'Other']):
            current_category = cell_val

        # Check if it's BM Index row — columns 6-9 hold the 1Y/3Y/5Y/10Y values
        elif cell_val == 'BM Index' and current_category:
            bm_1y = ws.cell(i, 6).value
            bm_3y = ws.cell(i, 7).value
            bm_5y = ws.cell(i, 8).value
            bm_10y = ws.cell(i, 9).value

            # Only store if at least one value is present
            if any([bm_1y, bm_3y, bm_5y, bm_10y]):
                bm_data[current_category] = {
                    'cagr_1y': bm_1y,
                    'cagr_3y': bm_3y,
                    'cagr_5y': bm_5y,
                    'cagr_10y': bm_10y
                }

        # Check if it's Category Average row — adds P/E (col 12) and P/B (col 13)
        elif cell_val == 'Category Average' and current_category:
            cat_1y = ws.cell(i, 6).value
            cat_3y = ws.cell(i, 7).value
            cat_5y = ws.cell(i, 8).value
            cat_10y = ws.cell(i, 9).value
            pe = ws.cell(i, 12).value
            pb = ws.cell(i, 13).value

            # Only store if at least one CAGR value is present
            if any([cat_1y, cat_3y, cat_5y, cat_10y]):
                cat_avg_data[current_category] = {
                    'cagr_1y': cat_1y,
                    'cagr_3y': cat_3y,
                    'cagr_5y': cat_5y,
                    'cagr_10y': cat_10y,
                    'pe_ratio': pe,
                    'pb_ratio': pb
                }

        # Check if it's a fund row (not category header, BM Index, or Category Average)
        elif cell_val and cell_val not in ['BM Index', 'Category Average', 'Fund'] and current_category:
            # Extract fund name
            fund_name = str(cell_val).strip()

            # Extract weightage if we found the Weightage column
            if weightage_col_idx:
                weightage_val = ws.cell(i, weightage_col_idx).value
                if weightage_val is not None:
                    try:
                        # Convert to int if possible, otherwise round float to nearest int
                        if isinstance(weightage_val, float):
                            fund_weightages[fund_name] = int(round(weightage_val))
                        else:
                            fund_weightages[fund_name] = int(weightage_val)
                    except (ValueError, TypeError):
                        # If conversion fails, skip this fund
                        pass

    wb.close()

    print(f"Loaded reference data: {len(bm_data)} categories with BM Index, {len(cat_avg_data)} with Category Average, {len(fund_weightages)} fund weightages")

    return bm_data, cat_avg_data, fund_weightages
112
+
113
+
114
def get_fund_weightage_from_reference(fund_name: str, fund_weightages: Dict[str, int]) -> Optional[int]:
    """
    Get the manually adjusted weightage for a fund from reference data.

    Args:
        fund_name: Name of the fund
        fund_weightages: Dictionary of fund name to weightage from Processed_data.xlsx

    Returns:
        Weightage value if found, None otherwise
    """
    # An exact name hit wins outright.
    try:
        return fund_weightages[fund_name]
    except KeyError:
        pass

    # Fall back to a case-insensitive substring match in either direction,
    # tolerating slight naming differences between sources.
    needle = fund_name.lower()
    for reference_name, weightage in fund_weightages.items():
        haystack = reference_name.lower()
        if needle in haystack or haystack in needle:
            return weightage

    return None
135
+
136
+
137
# Default reference file path (was accidentally defined twice; kept once).
DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx"
src/scheme_resolver.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scheme Code Resolver
2
+ ======================
3
+ Resolves missing AMFI scheme codes by fuzzy-matching the fund name from the
4
+ CSV against mfapi.in's /mf/search endpoint.
5
+
6
+ This runs as a PRE-TRIAGE step so that the NAV engine can fire for funds whose
7
+ scheme code was absent from the CSV.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import difflib
13
+ import re
14
+ import time
15
+
16
+ import requests
17
+
18
+
19
+ MFAPI_SEARCH = "https://api.mfapi.in/mf/search"
20
+ MATCH_CUTOFF = 0.52 # minimum SequenceMatcher ratio to accept
21
+ SLEEP_BETWEEN = 0.25 # seconds between API calls (polite rate limit)
22
+
23
+ # Manual overrides for schemes that mfapi's search endpoint does not
24
+ # currently return, but whose AMFI codes are known and stable. Keys are
25
+ # normalized fund names (see _normalize).
26
+ SCHEME_OVERRIDES: dict[str, str] = {
27
+ # ── Pre-verified from AMFI NAV master (portal.amfiindia.com) ──────────────
28
+ # These funds have empty scheme codes in source CSV and cannot be reliably
29
+ # resolved via mfapi fuzzy search. Codes are Regular Plan - Growth only.
30
+
31
+ # Existing override
32
+ "kotak tax saver scheme growth": "109234",
33
+
34
+ # ── Debt: Banking and PSU ─────────────────────────────────────────────────
35
+ "hdfc banking and psu debt fund growth option": "128628",
36
+ "icici prudential banking and psu debt fund growth": "112342",
37
+ "kotak banking and psu debt growth": "123690",
38
+ "invesco india banking and psu fund growth option": "118232",
39
+ "sundaram banking psu fund formerly known as sundaram banking and psu debt fund regular plan growth": "100784",
40
+ "hsbc banking and psu debt fund regular growth": "151104",
41
+ "iti banking psu debt fund regular plan growth option": "148535",
42
+
43
+ # ── Debt: Liquid ──────────────────────────────────────────────────────────
44
+ "dsp liquidity fund regular plan growth": "119120",
45
+ "invesco india liquid fund growth": "104488",
46
+ "invesco india liquid fund regular growth": "118769",
47
+ "union liquid fund growth option": "115398",
48
+ "parag parikh liquid fund regular plan growth": "149038",
49
+ "motilal oswal liquid fund regular growth": "147622",
50
+ "iti liquid fund regular plan growth option": "147153",
51
+ "quantum liquid fund regular plan growth option": "103504",
52
+ "lic mf liquid fund regular plan growth": "120716",
53
+ "icici prudential liquid fund growth": "120593",
54
+ "aditya birla sun life liquid fund retail growth": "100042",
55
+ "aditya birla sun life liquid fund growth": "100047",
56
+ "edelweiss liquid fund regular plan growth option": "140182",
57
+ "edelweiss liquid fund retail plan growth option": "119114",
58
+ "axis liquid fund retail plan growth option": "112090",
59
+ "sbi liquid fund regular plan growth": "119822",
60
+ "nippon india liquid fund retail option growth plan": "100837",
61
+
62
+ # ── Debt: Overnight ───────────────────────────────────────────────────────
63
+ "uti overnight fund regular plan growth option": "100814",
64
+ "canara robeco overnight fund regular plan growth option": "147534",
65
+ "dsp overnight fund regular plan growth": "146061",
66
+ "franklin india overnight fund growth": "146210",
67
+ "bandhan overnight fund regular plan growth": "146187",
68
+ "iti overnight fund regular plan growth option": "148529",
69
+ "union overnight fund regular plan growth option": "146997",
70
+ "icici prudential overnight fund growth": "145811",
71
+ "edelweiss overnight fund regular plan growth": "147569",
72
+ "lic mf overnight fund regular plan growth": "146065",
73
+ "hdfc overnight fund growth option": "145822",
74
+
75
+ # ── Debt: Ultra Short Duration ────────────────────────────────────────────
76
+ "icici prudential ultra short term fund growth": "120505",
77
+ "invesco india ultra short duration fund growth": "117825",
78
+ "uti ultra short duration fund regular plan growth option": "102532",
79
+ "aditya birla sun life savings fund growth regular plan": "119293",
80
+ "aditya birla sun life savings fund retail growth": "119293",
81
+ "hdfc ultra short term fund growth option": "145539",
82
+ "aditya birla sun life savings fund discipline advantage plan": "112016",
83
+ "pgim india ultra short duration fund growth": "100474",
84
+ "iti ultra short duration fund regular plan growth option": "148533",
85
+ "motilal oswal ultra short term fund mofustf regular plan growth": "124233",
86
+ "tata ultra short term fund regular plan growth": "146070",
87
+ "kotak savings fund growth": "119270",
88
+ "lic mf ultra short duration fund regular plan growth": "147770",
89
+ "canara robeco ultra short term fund regular plan growth option": "119671",
90
+ "sundaram ultra short duration fund formerly known as principal ultra short term fund growth option": "120826",
91
+ "bank of india ultra short duration fund regular plan growth": "109269",
92
+
93
+ # ── Debt: Short Duration ──────────────────────────────────────────────────
94
+ "hdfc short term debt fund growth option": "119247",
95
+ "icici prudential short term fund growth option": "101758",
96
+ "sbi short horizon debt fund short term fund retail growth": "106227",
97
+ "sbi short term debt fund regular plan growth": "119831",
98
+ "kotak bond short term plan growth": "101373",
99
+ "dsp short term fund regular plan growth": "119598",
100
+ "lic mf short duration fund regular plan growth": "145952",
101
+ "mirae asset short duration fund regular plan growth": "148416",
102
+ "invesco india short duration fund growth": "105185",
103
+ "canara robeco short duration fund regular plan growth option": "119675",
104
+ "groww short duration fund formerly known as indiabulls short term fund regular plan growth option": "123708",
105
+ "tata short term bond fund regular plan growth option": "119802",
106
+
107
+ # ── Debt: Medium Duration ─────────────────────────────────────────────────
108
+ "aditya birla sun life medium term plan growth regular plan": "111803",
109
+ "axis strategic bond fund regular plan growth option": "116894",
110
+ "icici prudential medium term bond fund growth": "120841",
111
+ "hdfc medium term debt fund growth option": "119238",
112
+ "kotak medium term fund growth": "119281",
113
+ "dsp bond fund growth": "100078",
114
+ "sundaram medium duration fund formerly known as sundaram medium term bond fund regular plan growth": "100603",
115
+
116
+ # ── ETFs ──────────────────────────────────────────────────────────────────
117
+ "hdfc nifty100 low volatility 30 etf growth option": "145748",
118
+ "hdfc nifty200 momentum 30 etf growth option": "146058",
119
+ "hdfc nifty it etf growth option": "120493",
120
+ "hdfc nifty private bank etf growth option": "145696",
121
+
122
+ # ── Index Funds ───────────────────────────────────────────────────────────
123
+ "dsp nifty next 50 index fund regular plan growth": "143669",
124
+ "uti nifty next 50 index fund regular plan growth option": "120713",
125
+ "motilal oswal nifty smallcap 250 index regular plan": "147960",
126
+ "icici prudential nifty pharma index fund growth": "143874",
127
+ "dsp nifty 50 index fund regular plan growth": "143537",
128
+ "motilal oswal nifty midcap 150 index fund regular plan": "147068",
129
+ "sbi nifty index fund regular plan growth": "135818",
130
+ "motilal oswal nifty bank index regular plan": "145552",
131
+ }
132
+
133
+
134
+ def _normalize(name: str) -> str:
135
+ """Convert hyphenated CSV name to a clean lowercase string."""
136
+ return re.sub(r"[-_]+", " ", name).strip().lower()
137
+
138
+
139
+ def _search_query(name: str) -> str:
140
+ """Take first 6 tokens for a focused search query."""
141
+ return " ".join(_normalize(name).split()[:6])
142
+
143
+
144
def _search_mfapi(query: str) -> list[dict]:
    """Hit mfapi's /mf/search endpoint; any failure is logged and yields []."""
    try:
        response = requests.get(MFAPI_SEARCH, params={"q": query}, timeout=15)
        response.raise_for_status()
        return response.json()
    except Exception as exc:
        print(f" [resolver] search error for '{query}': {exc}")
        return []
152
+
153
+
154
def _best_match(candidates: list[dict], target_name: str) -> dict | None:
    """Return the candidate most similar to target_name, or None.

    Similarity is difflib's SequenceMatcher ratio over normalized names;
    candidates below MATCH_CUTOFF are rejected.
    """
    if not candidates:
        return None

    target = _normalize(target_name)

    def similarity(item: dict) -> float:
        candidate_name = _normalize(item.get("schemeName", ""))
        return difflib.SequenceMatcher(None, target, candidate_name).ratio()

    winner = max(candidates, key=similarity)
    if similarity(winner) >= MATCH_CUTOFF:
        return winner
    return None
169
+
170
+
171
+ def _is_valid_scheme_code(code: str) -> bool:
172
+ """AMFI scheme codes are purely numeric (e.g. 120586). Platform codes like GROWWEH are invalid."""
173
+ return bool(code and code.isdigit())
174
+
175
+
176
def resolve_scheme_code_for_fund_name(
    fund_name: str,
) -> tuple[str | None, str | None]:
    """
    Resolve a scheme code for one fund name.

    Resolution order:
    1. Exact normalized-name override from SCHEME_OVERRIDES
    2. mfapi search + fuzzy best-match

    Returns (code, matched_name) — matched_name is the literal "override"
    when the override table supplied the code; (None, None) on no match.
    """
    known_code = SCHEME_OVERRIDES.get(_normalize(fund_name))
    if known_code:
        return known_code, "override"

    hits = _search_mfapi(_search_query(fund_name))
    best = _best_match(hits, fund_name)
    if not best:
        return None, None
    return str(best["schemeCode"]), best.get("schemeName", "")
197
+
198
+
199
def resolve_missing_scheme_codes(
    rows: list[dict[str, str]],
    *,
    verbose: bool = True,
) -> tuple[list[dict[str, str]], dict[str, str]]:
    """
    Resolve blank scheme codes and also correct any exact-name rows whose
    current numeric code disagrees with SCHEME_OVERRIDES.

    Blank/invalid codes are resolved via SCHEME_OVERRIDES (O(1) dict lookup)
    first, then mfapi search in parallel.

    Args:
        rows: CSV rows; reads "Fund" and reads/writes "Scheme Code" in place.
        verbose: print per-fund progress plus a summary line.

    Returns:
        (rows, resolved) where resolved maps fund name -> code for every row
        that was filled in or corrected.

    Complexity: O(N) time, O(N) space where N = funds with missing codes.
    Network I/O parallelised with ThreadPoolExecutor(20) — pure I/O bound.
    """
    # Proper imports instead of the previous __import__("threading") hack.
    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    resolved: dict[str, str] = {}
    corrected_existing = 0

    # ── Collect rows that need resolution ─────────────────────────────────────
    target_rows: list[dict[str, str]] = []
    for row in rows:
        fund_name = (row.get("Fund") or "").strip()
        # Skip blanks, names without the "AMC - Scheme - Plan" shape, and
        # category headers (which contain ':').
        if not fund_name or fund_name.count("-") < 2 or ":" in fund_name:
            continue
        norm = _normalize(fund_name)
        raw_code = (row.get("Scheme Code") or "").strip()
        override_code = SCHEME_OVERRIDES.get(norm)

        # Future-proofing: if we know the canonical code for this exact fund name,
        # correct it even when the CSV already contains a numeric but stale code.
        if override_code and raw_code != override_code:
            row["Scheme Code"] = override_code
            resolved[fund_name] = override_code
            corrected_existing += 1
            continue

        if _is_valid_scheme_code(raw_code):
            continue
        if raw_code and not _is_valid_scheme_code(raw_code):
            row["Scheme Code"] = ""  # clear invalid platform codes e.g. GROWWEH
        target_rows.append(row)

    total_missing = len(target_rows)
    if total_missing == 0:
        if verbose:
            if corrected_existing:
                print(f"[resolver] Corrected {corrected_existing} existing scheme codes via override table.")
            else:
                print("[resolver] No missing scheme codes found.")
        return rows, resolved

    if verbose:
        print(f"[resolver] Resolving {total_missing} missing scheme codes (parallel)…")

    # ── Phase A: Override table — O(1) per fund, no network ───────────────────
    # NOTE(review): the collection loop above already consumes every override
    # hit, so this pass looks redundant — kept as a defensive re-check.
    mfapi_needed: list[dict[str, str]] = []
    override_count = 0

    for row in target_rows:
        fund_name = (row.get("Fund") or "").strip()
        code = SCHEME_OVERRIDES.get(_normalize(fund_name))
        if code:
            row["Scheme Code"] = code
            resolved[fund_name] = code
            override_count += 1
        else:
            mfapi_needed.append(row)

    if verbose and override_count:
        print(f" [resolver] {override_count} resolved via override table (instant)")
    if verbose and corrected_existing:
        print(f" [resolver] {corrected_existing} existing codes corrected via override table")

    # ── Phase B: mfapi search — parallel ThreadPoolExecutor ───────────────────
    if not mfapi_needed:
        if verbose:
            print(f"[resolver] Done. {len(resolved)}/{total_missing} resolved.")
        return rows, resolved

    lock = threading.Lock()  # guards the shared progress counter and prints
    completed = [0]  # single-cell list so inner scope can mutate the counter

    def _resolve_one(row: dict[str, str]) -> tuple[str, str | None, str | None]:
        """Returns (fund_name, scheme_code_or_None, matched_name_or_None)."""
        fund_name = (row.get("Fund") or "").strip()
        query = _search_query(fund_name)
        candidates = _search_mfapi(query)
        match = _best_match(candidates, fund_name)
        if match:
            return fund_name, str(match["schemeCode"]), match.get("schemeName", "")
        return fund_name, None, None

    # 20 workers: mfapi is pure REST, stateless, handles concurrency fine
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_to_row = {executor.submit(_resolve_one, row): row for row in mfapi_needed}
        for future in as_completed(future_to_row):
            row = future_to_row[future]
            fund_name = (row.get("Fund") or "").strip()
            try:
                _, code, matched_name = future.result()
            except Exception:
                code = matched_name = None

            with lock:
                completed[0] += 1
                n = completed[0]
                total_mfapi = len(mfapi_needed)
                if code:
                    row["Scheme Code"] = code
                    resolved[fund_name] = code
                    if verbose:
                        print(f" [{n}/{total_mfapi}] OK {fund_name[:55]}")
                        print(f" -> [{code}] {(matched_name or '')[:55]}")
                else:
                    if verbose:
                        print(f" [{n}/{total_mfapi}] NO {fund_name[:55]} -- no match")

    if verbose:
        print(f"[resolver] Done. {len(resolved)}/{total_missing} resolved "
              f"({override_count} overrides + {len(resolved)-override_count-corrected_existing} mfapi"
              f"{f', {corrected_existing} corrected existing codes' if corrected_existing else ''}).")
    return rows, resolved
src/weightage.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Weightage scoring algorithm for mutual fund schemes.
3
+
4
+ Scoring method: Sum of column weights where cell qualifies for Light Green (Top/Bottom 10)
5
+ AND is NOT overridden by Light Red fill (threshold violations).
6
+
7
+ Weight Distribution (Advisor-revised, March 2026):
8
+ 1. Sortino Ratio: 1.300 (Top 10, higher is better)
9
+ 2. Sharpe Ratio: 1.200 (Top 10, higher is better)
10
+ 3. Information Ratio: 1.000 (Top 10, higher is better, Light Red if < 0)
11
+ 4. Alpha: 1.000 (Top 10, higher is better, Light Red if < 1)
12
+ 5. Maximum Drawdown: 1.350 (Top 10, closest to 0 is better)
13
+ 6. Down Market Capture: 1.000 (Bottom 10, lower is better)
14
+ 7. Standard Deviation: 1.000 (Bottom 10, lower is better)
15
+ 8. 10 Years CAGR: 0.750 (Top 10, higher is better, Light Red if < Category Avg)
16
+ 9. 5 Years CAGR: 0.600 (Top 10, higher is better, Light Red if < Category Avg)
17
+ 10. 3 Years CAGR: 0.400 (Top 10, higher is better, Light Red if < Category Avg)
18
+ 11. P/E Ratio: 0.150 (Bottom 10, lower is better)
19
+ 12. TER: 0.150 (Bottom 10, lower is better)
20
+ 13. Turnover (%): 0.100 (Bottom 10, lower is better)
21
+
22
+ Total: 10.000
23
+ """
24
+
25
+ import math
26
+ from typing import List, Optional, Dict
27
+ from src.models import Fund
28
+
29
+
30
+ # ─── Weight map (Advisor-revised March 2026) ─────────────────────────────────
31
+ WEIGHTS: Dict[str, float] = {
32
+ "sortino": 1.30,
33
+ "sharpe": 1.20,
34
+ "info_ratio": 1.00,
35
+ "alpha": 1.00,
36
+ "max_drawdown": 1.35,
37
+ "down_capture": 1.00,
38
+ "std_dev": 1.00,
39
+ "cagr_10y": 0.75,
40
+ "cagr_5y": 0.60,
41
+ "cagr_3y": 0.40,
42
+ "pe_ratio": 0.15,
43
+ "ter": 0.15,
44
+ "turnover": 0.10,
45
+ }
46
+
47
+ # Sanity-check: total should equal 10.000
48
+ _TOTAL = round(sum(WEIGHTS.values()), 3)
49
+ assert _TOTAL == 10.000, f"WEIGHTS do not sum to 10.000 β€” got {_TOTAL}"
50
+
51
# Metrics ranked descending — membership in the Top 10 earns Light Green.
TOP_10_METRICS = [
    "sharpe",
    "sortino",
    "alpha",
    "info_ratio",
    "max_drawdown",
    "cagr_3y",
    "cagr_5y",
    "cagr_10y",
]

# Metrics ranked ascending — membership in the Bottom 10 earns Light Green.
BOTTOM_10_METRICS = [
    "ter",
    "turnover",
    "std_dev",
    "down_capture",
    "pe_ratio",
]

# Dual-condition metrics: a green cell may still be overridden to Light Red.
# Value is (rule_type, threshold); threshold is unused for category-avg rules.
DUAL_CONDITION_RULES: Dict[str, tuple] = {
    "alpha": ("below_value", 1),              # Light Red when alpha < 1%
    "info_ratio": ("below_value", 0),         # Light Red when IR < 0
    "cagr_3y": ("below_category_avg", None),  # Light Red when below category avg
    "cagr_5y": ("below_category_avg", None),
    "cagr_10y": ("below_category_avg", None),
}
72
+
73
+
74
+ # ─── Value helpers ────────────────────────────────────────────────────────────
75
+
76
+ def _is_valid(v) -> bool:
77
+ """True if v is a real, non-zero, non-NaN number."""
78
+ if v is None:
79
+ return False
80
+ if isinstance(v, float) and (v != v): # NaN check
81
+ return False
82
+ # 0.0 is treated as missing/not-applicable for risk metrics
83
+ if v == 0:
84
+ return False
85
+ return True
86
+
87
+
88
+ def _is_valid_drawdown(v) -> bool:
89
+ """
90
+ For Maximum Drawdown specifically: 0.0 is a genuine data-quality gap
91
+ (overnight/liquid funds sometimes publish 0 when the real figure was never
92
+ fetched). Treat 0 as invalid so that only funds with a real (negative)
93
+ drawdown value compete in the ranking.
94
+ """
95
+ if v is None:
96
+ return False
97
+ if isinstance(v, float) and v != v: # NaN
98
+ return False
99
+ if v == 0:
100
+ return False # ← exact zero excluded; see drawdown_zero_fix() below
101
+ return True
102
+
103
+
104
# ─── Ranking helpers ──────────────────────────────────────────────────────────

def _top_n(fund: Fund, peers: List[Fund], metric: str, n: int = 10) -> bool:
    """
    True when *fund* ranks within the top-N (highest values) for *metric*
    among its category peers.

    Special case: for Information Ratio an exact 0.0 participates in the
    ranking (Excel treats 0 as a valid value; only < 0 is "red"), whereas
    other metrics treat 0 as missing data.
    """
    if metric == "info_ratio":
        # 0 is a genuine value here; reject only None/NaN.
        def usable(v):
            return v is not None and not (isinstance(v, float) and v != v)
    else:
        usable = _is_valid

    own = getattr(fund, metric, None)
    if not usable(own):
        return False

    pool = sorted(
        (getattr(p, metric, None) for p in peers if usable(getattr(p, metric, None))),
        reverse=True,
    )
    if len(pool) < 2:
        return False

    # Mirror Excel's "Top N items" conditional formatting, with N capped at
    # the number of valid entries; ties at the cutoff qualify.
    cutoff = pool[min(n, len(pool)) - 1]
    return own >= cutoff
139
+
140
+
141
def _top_n_drawdown(fund: Fund, peers: List[Fund], n: int = 10) -> bool:
    """
    Top-N check specialised for Maximum Drawdown.

    "Closest to 0" wins: -5% beats -20%, so a descending sort still puts
    the best fund first.  Only non-zero, non-None values participate (see
    _is_valid_drawdown), and strict-N is used (no small-category fallback)
    so a lone liquid fund with a real drawdown doesn't qualify purely
    because its category is tiny.
    """
    own = getattr(fund, "max_drawdown", None)
    if not _is_valid_drawdown(own):
        return False

    pool = sorted(
        (v for v in (getattr(p, "max_drawdown", None) for p in peers)
         if _is_valid_drawdown(v)),
        reverse=True,  # -5 > -20 → -5 is rank-1
    )
    if not pool:
        return False

    return own >= pool[min(n, len(pool)) - 1]
163
+
164
+
165
def _bottom_n(fund: Fund, peers: List[Fund], metric: str, n: int = 10) -> bool:
    """True when *fund* ranks within the bottom-N (lowest values) for *metric*."""
    own = getattr(fund, metric, None)
    if not _is_valid(own):
        return False

    pool = sorted(
        getattr(p, metric, None)
        for p in peers
        if _is_valid(getattr(p, metric, None))
    )
    if len(pool) < 2:
        return False

    # Mirror Excel's "Bottom N items" conditional formatting, with N capped
    # at the number of valid entries; ties at the cutoff qualify.
    return own <= pool[min(n, len(pool)) - 1]
181
+
182
+
183
def _category_avg(peers: List[Fund], metric: str) -> Optional[float]:
    """Arithmetic mean of the valid *metric* values across *peers* (None if none)."""
    values = [v for v in (getattr(p, metric, None) for p in peers) if _is_valid(v)]
    if not values:
        return None
    return sum(values) / len(values)
188
+
189
+
190
def _light_red(fund: Fund, metric: str, cat_avg: Optional[float]) -> bool:
    """True when *metric* triggers a Light Red override for this fund."""
    rule = DUAL_CONDITION_RULES.get(metric)
    if rule is None:
        return False

    val = getattr(fund, metric, None)
    if not _is_valid(val):
        return False

    rule_type, threshold = rule
    if rule_type == "below_value":
        return val < threshold
    if rule_type == "below_category_avg":
        return cat_avg is not None and val < cat_avg
    return False
203
+
204
+
205
# ─── Drawdown zero-cell fix ───────────────────────────────────────────────────

def drawdown_zero_fix(
    funds: List[Fund],
    *,
    verbose: bool = True,
) -> int:
    """
    Detect funds whose max_drawdown is exactly 0 (data-quality gap) and
    recompute it from live NAV history via the NAV engine.

    Strategy
    --------
    1. Collect every fund where max_drawdown is 0 or None, excluding
       debt-style categories (tiny/no drawdown), funds younger than ~3
       years (no 3Y NAV history), and funds already attempted by the
       csv_enrichment NAV phase.

    2. For each such fund, call compute_nav_metrics_for_scheme() requesting
       only ["Maximum Drawdown"], fanned out over a thread pool.

    3. If a real negative value comes back, write it to fund.max_drawdown.

    Returns the count of cells successfully fixed.

    NOTE: This function requires network access (mfapi.in + yfinance).
          It is intentionally separated from compute_scores() so callers
          can opt in only when enrichment is desired.
    """
    # Import here to avoid circular dependency at module level
    try:
        from src.nav_metrics_engine import NavEngineCache, compute_nav_metrics_for_scheme
    except ImportError:
        if verbose:
            print("[drawdown_fix] nav_metrics_engine not available — skipping.")
        return 0

    # Category prefixes that identify debt-style funds — these legitimately
    # have near-zero drawdown, so a 0 cell is not a data gap for them.
    DEBT_PREFIXES = ("debt", "liquid", "overnight", "money market", "gilt",
                     "fixed maturity", "interval", "fmp")

    # Aliased import so the closure below can do isinstance checks against it.
    from datetime import datetime as _dt
    _now = _dt.now()

    def _fund_age_years(f) -> float | None:
        # Age in years from the fund's launch date; None when the private
        # _launch_date attribute is absent or not a datetime.
        ld = getattr(f, "_launch_date", None)
        if not isinstance(ld, _dt):
            return None
        return (_now - ld).days / 365.25

    # Funds the csv_enrichment NAV phase already tried — a second pass over
    # those would only repeat the same failure.
    try:
        from src.csv_enrichment import _NAV_ATTEMPTED_FUNDS as _nav_attempted
    except Exception:
        _nav_attempted = set()

    zero_funds = [
        f for f in funds
        if (
            # Only target funds where drawdown is truly missing (0 or None)
            (f.max_drawdown == 0 or f.max_drawdown is None)
            # AND only equity/hybrid — debt funds have tiny/no drawdown, skip them
            # NOTE(review): assumes f.category is always a str — confirm Fund guarantees this
            and not any(f.category.lower().startswith(pfx) for pfx in DEBT_PREFIXES)
            # AND fund must be ≥3 years old — younger funds can't have 3Y NAV history
            # (unknown age passes through: None means "can't tell", so we still try)
            and (_fund_age_years(f) is None or _fund_age_years(f) >= 3.0)
            # AND skip funds already attempted by csv_enrichment NAV phase —
            # if enrichment couldn't fill MDD, a second pass won't either
            and f.name not in _nav_attempted
        )
    ]

    if not zero_funds:
        if verbose:
            print("[drawdown_fix] No zero/missing drawdown cells found.")
        return 0

    if verbose:
        print(f"[drawdown_fix] Attempting to fix {len(zero_funds)} drawdown cells …")

    from concurrent.futures import ThreadPoolExecutor, as_completed as _as_completed
    import threading as _threading

    # Bulk-preload cache before parallel workers start (2 SQL queries instead of N)
    try:
        from src.nav_metrics_engine import _bulk_preload_cache, resolve_benchmark_ticker
        _scheme_codes = [getattr(f, "_scheme_code", None) or "" for f in zero_funds]
        _bench_tickers = [resolve_benchmark_ticker(getattr(f, "benchmark", "") or "") for f in zero_funds]
        _bulk_preload_cache(_scheme_codes, _bench_tickers)
    except Exception:
        pass  # graceful degradation — workers will fall back to per-query

    cache = NavEngineCache()
    fixed = 0
    # NOTE(review): results are consumed on the main thread via as_completed,
    # so this lock looks redundant — kept as defensive serialization.
    _lock = _threading.Lock()

    # Partition: only funds carrying a non-empty _scheme_code can be fetched.
    with_code = [
        (f, getattr(f, "_scheme_code", None) or "", getattr(f, "benchmark", "") or "")
        for f in zero_funds
        if (getattr(f, "_scheme_code", None) or "").strip()
    ]
    no_code = [f for f in zero_funds if not (getattr(f, "_scheme_code", None) or "").strip()]

    if verbose:
        for f in no_code:
            print(f" SKIP {f.name[:55]} — no scheme code available")

    def _fix_one(args):
        # Worker: fetch only Maximum Drawdown for one scheme.
        # Returns (fund, mdd_or_None, skip_reason).
        fund, scheme_code, benchmark = args
        metrics, skip = compute_nav_metrics_for_scheme(
            scheme_code=scheme_code,
            benchmark_type=benchmark,
            needed_metrics=["Maximum Drawdown"],
            cache=cache,
        )
        mdd = metrics.get("Maximum Drawdown")
        reason = skip.get("Maximum Drawdown", "unknown")
        return fund, mdd, reason

    with ThreadPoolExecutor(max_workers=12) as executor:
        futures = {executor.submit(_fix_one, item): item for item in with_code}
        for fut in _as_completed(futures):
            try:
                fund, mdd, reason = fut.result()
            except Exception as e:
                # NOTE(review): worker failures are swallowed silently —
                # consider logging `e` so network errors are visible.
                continue
            # Only accept a real (non-zero) drawdown; 0 would reintroduce
            # the very data gap we are trying to fix.
            if mdd is not None and mdd != 0:
                with _lock:
                    fund.max_drawdown = mdd
                    fixed += 1
                if verbose:
                    print(f" FIXED {fund.name[:55]} → MDD = {mdd:.3f}%")
            else:
                if verbose:
                    print(f" MISS {fund.name[:55]} — {reason}")

    if verbose:
        print(f"[drawdown_fix] Done. Fixed {fixed}/{len(zero_funds)} cells.")

    return fixed
351
+
352
+
353
# ─── Main scoring engine ──────────────────────────────────────────────────────

def compute_scores(funds: List[Fund]) -> List[Fund]:
    """
    Score and rank every fund within its category (mutates in place).

    For each weighted metric:
      1. Top-N / Bottom-N membership inside the category peer group
         earns "Light Green".
      2. A green cell hit by a dual-condition rule becomes "Light Red"
         and contributes nothing.
      3. Surviving green cells add their column weight to the score.

    fund.score is capped at 10.0 (model scale).  Also sets
    fund.rank_in_category (1 = best within category) and
    fund.is_top_quartile (True for the top ⌈N/4⌉ funds).

    Returns the same list (mutated in-place) for convenience.
    """
    # Bucket funds by category — scoring is always peer-relative.
    by_category: Dict[str, List[Fund]] = {}
    for f in funds:
        by_category.setdefault(f.category, []).append(f)

    for peers in by_category.values():

        # Category averages feed the CAGR dual-condition (Light Red) rules.
        averages = {
            m: _category_avg(peers, m)
            for m in ("cagr_3y", "cagr_5y", "cagr_10y")
        }

        for f in peers:
            total = 0.0

            for metric, weight in WEIGHTS.items():
                # ── Green check: drawdown has its own ranking rule ───────
                if metric == "max_drawdown":
                    green = _top_n_drawdown(f, peers)
                elif metric in TOP_10_METRICS:
                    green = _top_n(f, peers, metric)
                elif metric in BOTTOM_10_METRICS:
                    green = _bottom_n(f, peers, metric)
                else:
                    green = False

                # ── Light Red override zeroes the contribution ───────────
                # (_light_red is a no-op for metrics without a rule)
                if green and _light_red(f, metric, averages.get(metric)):
                    green = False

                if green:
                    total += weight

            f.score = round(min(total, 10.0), 3)

        # ── Rank within category: score desc, then name, then load order ──
        ranked = sorted(
            peers,
            key=lambda f: (-(f.score or 0), (f.name or "").lower(), getattr(f, "order", 0)),
        )
        quartile_size = max(1, math.ceil(len(ranked) / 4))

        for position, f in enumerate(ranked, start=1):
            f.rank_in_category = position
            f.is_top_quartile = position <= quartile_size

    return funds