Parthiban97 commited on
Commit
b0e15c1
Β·
verified Β·
1 Parent(s): 35d4c56

Upload 15 files

Browse files
.streamlit/config.toml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ [theme]
2
+ base = "dark"
3
+ primaryColor = "#4A90E2"
4
+ backgroundColor = "#0f0f0f"
5
+ secondaryBackgroundColor = "#1a1a1a"
6
+ textColor = "#e5e5e5"
7
+ font = "sans serif"
app.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import sys
5
+ import tempfile
6
+ import time
7
+ import traceback
8
+ from contextlib import redirect_stderr, redirect_stdout
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ import streamlit as st
14
+
15
+ from src.csv_enrichment import (
16
+ TARGET_COLUMNS,
17
+ EnrichmentConfig,
18
+ enrich_csv, # use canonical name (alias also works)
19
+ lookup_fund_metric_value,
20
+ )
21
+ from src.data_engine import run_data_engine
22
+
23
+
24
+ # ── Session logging ───────────────────────────────────────────────────────────
25
+
26
+ def _init_session_log() -> Path:
27
+ if "session_log_path" not in st.session_state:
28
+ log_dir = Path("logs") / "streamlit_sessions"
29
+ log_dir.mkdir(parents=True, exist_ok=True)
30
+ stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
31
+ log_path = log_dir / f"session_{stamp}.log"
32
+ log_path.write_text(
33
+ f"[{datetime.now().isoformat()}] session_started\n",
34
+ encoding="utf-8",
35
+ )
36
+ st.session_state["session_log_path"] = str(log_path)
37
+ return Path(st.session_state["session_log_path"])
38
+
39
+
40
def _log_session_event(message: str) -> None:
    """Append one timestamped line to the session log. Best-effort: never raises."""
    try:
        target = _init_session_log()
        with target.open("a", encoding="utf-8") as handle:
            handle.write(f"[{datetime.now().isoformat()}] {message}\n")
    except Exception:
        # Logging must never interrupt the UI flow.
        pass
47
+
48
+
49
def _log_session_block(title: str, content: str) -> None:
    """Append a delimited multi-line block to the session log. Best-effort: never raises."""
    try:
        target = _init_session_log()
        with target.open("a", encoding="utf-8") as handle:
            handle.write(f"[{datetime.now().isoformat()}] --- {title} (start) ---\n")
            if content.strip():
                handle.write(content.rstrip() + "\n")
            else:
                handle.write("(no output)\n")
            handle.write(f"[{datetime.now().isoformat()}] --- {title} (end) ---\n")
    except Exception:
        # Logging must never interrupt the UI flow.
        pass
58
+
59
+
60
+ # ── Captured output runner ────────────────────────────────────────────────────
61
+
62
+ def _run_with_captured_output(func: Any, *args: Any, **kwargs: Any) -> tuple[Any, str]:
63
+ """Run function, mirror prints to terminal, capture for UI display."""
64
+
65
+ class _TeeCapture(io.TextIOBase):
66
+ def __init__(self, mirror: Any, on_write: Any = None) -> None:
67
+ self._mirror = mirror
68
+ self._buffer = io.StringIO()
69
+ self._on_write = on_write
70
+
71
+ def write(self, s: str) -> int:
72
+ text = str(s)
73
+ self._buffer.write(text)
74
+ try:
75
+ self._mirror.write(text)
76
+ self._mirror.flush()
77
+ except Exception:
78
+ pass
79
+ if self._on_write is not None:
80
+ try:
81
+ self._on_write(text)
82
+ except Exception:
83
+ pass
84
+ return len(text)
85
+
86
+ def flush(self) -> None:
87
+ try:
88
+ self._mirror.flush()
89
+ except Exception:
90
+ pass
91
+
92
+ def getvalue(self) -> str:
93
+ return self._buffer.getvalue()
94
+
95
+ live_callback = kwargs.pop("live_callback", None)
96
+ out_tee = _TeeCapture(sys.__stdout__, live_callback)
97
+ err_tee = _TeeCapture(sys.__stderr__, live_callback)
98
+ with redirect_stdout(out_tee), redirect_stderr(err_tee):
99
+ result = func(*args, **kwargs)
100
+ return result, out_tee.getvalue() + err_tee.getvalue()
101
+
102
+
103
+ # ── CSS ───────────────────────────────────────────────────────────────────────
104
+
105
def _inject_custom_css() -> None:
    """Inject the app's dark-theme CSS.

    Palette mirrors .streamlit/config.toml (primary #4A90E2, bg #0f0f0f,
    secondary #1a1a1a, text #e5e5e5) via CSS custom properties on :root.
    """
    st.markdown(
        """
        <style>
        :root {
            --mf-primary: #4A90E2;
            --mf-accent: #22c55e;
            --mf-bg: #0f0f0f;
            --mf-bg-secondary: #1a1a1a;
            --mf-surface: #1a1a1a;
            --mf-text: #e5e5e5;
            --mf-text-muted: #a0a0a0;
            --mf-border: #333333;
        }
        .mf-shell { max-width: 1100px; margin: 0 auto; padding: 0 0 3rem 0; }
        .mf-hero {
            padding: 1.9rem 2.1rem 1.5rem 2.1rem;
            border-radius: 18px;
            background: var(--mf-bg-secondary);
            border: 1px solid var(--mf-border);
        }
        .mf-kicker {
            letter-spacing: .16em; font-size: 0.75rem;
            text-transform: uppercase; color: var(--mf-primary); margin-bottom: 0.5rem;
        }
        .mf-title {
            font-size: 2.2rem; font-weight: 650;
            line-height: 1.1; color: var(--mf-text); margin-bottom: 0.75rem;
        }
        .mf-subtitle { max-width: 40rem; font-size: 0.95rem; color: var(--mf-text-muted); }
        .mf-panel {
            margin-top: 1.75rem; padding: 1.5rem 1.75rem 1.75rem 1.75rem;
            border-radius: 20px; background: var(--mf-surface);
            border: 1px solid var(--mf-border);
        }
        .mf-helper { font-size: 0.8rem; color: var(--mf-text-muted); margin-bottom: 0.9rem; }
        .mf-steps { font-size: 0.78rem; color: var(--mf-text-muted); margin-top: 0.3rem; }
        .mf-steps li { margin-bottom: 0.1rem; }
        .mf-metrics { display: flex; flex-wrap: wrap; gap: 0.75rem; margin-top: 1.25rem; }
        .mf-metric {
            flex: 0 0 auto; min-width: 140px; padding: 0.6rem 0.8rem;
            border-radius: 0.9rem; border: 1px solid var(--mf-border);
            background: var(--mf-bg-secondary);
        }
        .mf-metric-label {
            font-size: 0.72rem; text-transform: uppercase;
            letter-spacing: 0.09em; color: var(--mf-text-muted); margin-bottom: 0.2rem;
        }
        .mf-metric-value { font-size: 1.05rem; font-weight: 600; color: var(--mf-accent); }
        .mf-timing {
            margin-top: 1rem; padding: 0.75rem 1rem;
            border-radius: 0.75rem; border: 1px solid var(--mf-border);
            background: var(--mf-bg-secondary); font-size: 0.8rem;
            color: var(--mf-text-muted);
        }
        .mf-download-label {
            font-size: 0.8rem; color: var(--mf-text-muted);
            margin-top: 1.4rem; margin-bottom: 0.35rem;
        }
        .stFileUploader div[data-testid="stFileUploaderDropzone"] {
            border-radius: 0.9rem; border-color: var(--mf-border);
            background: var(--mf-bg-secondary);
        }
        .stButton > button[kind="primary"], .stDownloadButton > button {
            border-radius: 0.5rem; border: none;
            background: var(--mf-primary) !important;
            color: white !important; font-weight: 600;
        }
        .stApp, [data-testid="stAppViewContainer"] { background-color: var(--mf-bg); }
        .block-container { padding-top: 1.5rem; }
        @media (max-width: 768px) {
            .mf-hero { padding: 1.4rem 1.3rem 1.2rem 1.3rem; }
            .mf-title { font-size: 1.6rem; }
        }
        </style>
        """,
        unsafe_allow_html=True,
    )
183
+
184
+
185
+ # ── Main ──────────────────────────────────────────────────────────────────────
186
+
187
def main() -> None:
    """Render the single-page app: upload CSV -> enrich -> score -> download.

    Flow: hero + two tabs. "Run analysis" handles upload, runs the two-phase
    pipeline (enrich_csv, then run_data_engine) with live log streaming, and
    offers the generated workbook for download. "How scoring works" is static.
    """
    st.set_page_config(
        page_title="MF Scoring Engine · Advisor Demo",
        page_icon="📈",
        layout="centered",
    )

    _inject_custom_css()
    _init_session_log()
    _log_session_event("app_rendered")

    st.markdown('<div class="mf-shell">', unsafe_allow_html=True)

    st.markdown(
        """
        <section class="mf-hero">
          <div class="mf-kicker">Advisor tool</div>
          <div class="mf-title">Score your mutual fund list in Excel.</div>
          <p class="mf-subtitle">
            Upload your mutual fund CSV. The app runs enrichment (NAV engine → web fallback → median),
            scores every fund, and gives you a ready-to-share Excel workbook.
          </p>
        </section>
        """,
        unsafe_allow_html=True,
    )

    st.markdown('<section class="mf-panel">', unsafe_allow_html=True)

    tab_run, tab_about = st.tabs(["Run analysis", "How scoring works"])

    with tab_run:
        st.markdown("### Upload CSV & generate workbook")
        st.markdown(
            """
            <p class="mf-helper">
            Upload your standard fund universe CSV
            (<code>Fund</code>, <code>Benchmark Type</code>, CAGR columns, etc.).<br>
            <strong>Firecrawl/Tavily is used only for missing P/E and P/B</strong> —
            all risk metrics (Alpha, Sharpe, Sortino, etc.) are computed directly from NAV history.
            </p>
            """,
            unsafe_allow_html=True,
        )

        uploaded_file = st.file_uploader(
            "Step 1 · Upload fund universe CSV",
            type=["csv"],
            help="Same CSV you feed into the offline data engine.",
        )
        if uploaded_file is not None:
            st.caption(
                f"Selected: **{uploaded_file.name}** · "
                f"{(len(uploaded_file.getbuffer()) / 1024):.1f} KB"
            )
            _log_session_event(
                f"uploaded_file name={uploaded_file.name} "
                f"size_kb={(len(uploaded_file.getbuffer())/1024):.1f}"
            )

        st.info(
            "Pipeline: **Scheme code resolution → NAV engine (parallel, 12 workers) "
            "→ PE/PB web lookup → category median fallback → scoring engine**"
        )

        st.markdown(
            """
            <ul class="mf-steps">
              <li>1 — Upload your latest CSV export.</li>
              <li>2 — Click <strong>Run analysis</strong> and watch live logs.</li>
              <li>3 — Download the scored Excel when complete.</li>
            </ul>
            """,
            unsafe_allow_html=True,
        )

        run_clicked = st.button(
            "Step 2 · Run analysis",
            type="primary",
            use_container_width=True,
            disabled=uploaded_file is None,
        )

        # ── State carried across rerun ─────────────────────────────────────
        # NOTE(review): these are plain locals, so they reset on every
        # Streamlit rerun — the download area below only renders in the same
        # run that produced the file. Persisting across reruns would need
        # st.session_state; confirm whether that was the intent.
        generated_bytes: io.BytesIO | None = None
        generated_filename: str | None = None
        funds_count: int | None = None
        categories_count: int | None = None
        enrichment_summary: str | None = None
        timing_html: str | None = None

        if run_clicked:
            _log_session_event("run_analysis_clicked")

            if uploaded_file is None:
                # Defensive: the button is disabled without an upload, but
                # guard anyway in case of a stale rerun.
                st.warning("Please upload a CSV file first.")
                _log_session_event("run_aborted_no_upload")
            else:
                # Unique stem so concurrent/repeated runs don't clobber output.
                base_stem = Path(uploaded_file.name).stem
                stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                input_stem = f"{base_stem}_{stamp}"

                # Persist the upload to disk; the pipeline expects a file path.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
                    tmp.write(uploaded_file.getbuffer())
                    input_path = Path(tmp.name)

                out_dir = Path("output")
                out_dir.mkdir(exist_ok=True)
                generated_path = out_dir / f"fund_analysis_{input_stem}.xlsx"

                t_total_start = time.perf_counter()

                try:
                    with st.status("Processing…", expanded=True) as status:
                        live_lines: list[str] = []
                        live_box = st.empty()

                        # Noise patterns to suppress from the live log box
                        _SUPPRESS = (
                            "missing ScriptRunContext",
                            "FutureWarning",
                            "Passing literal json",
                            "To read from a literal string",
                            "return pd.read_json",
                        )

                        def _live_sink(chunk: str) -> None:
                            # Stream captured stdout/stderr into the UI,
                            # keeping only the last 50 non-noise lines.
                            clean = chunk.replace("\r", "")
                            new = [
                                ln for ln in clean.split("\n")
                                if ln.strip()
                                and not any(s in ln for s in _SUPPRESS)
                            ]
                            if not new:
                                return
                            live_lines.extend(new)
                            if len(live_lines) > 50:
                                del live_lines[:-50]
                            live_box.code("\n".join(live_lines), language="text")

                        # ── Phase 1: Enrichment ────────────────────────────
                        st.write("**1/2 Enrichment** — scheme codes → NAV engine → PE/PB → medians…")
                        t_enrich_start = time.perf_counter()

                        enrichment, enrich_output = _run_with_captured_output(
                            enrich_csv,
                            str(input_path),
                            config=EnrichmentConfig(
                                enabled=True,
                                max_cells=None,
                                min_confidence=0.65,
                                resolve_scheme_codes=True,  # parallel scheme resolution
                                enable_nav_engine=True,  # parallel NAV engine (12 workers)
                                web_search_pe_pb_only=True,  # only PE/PB uses API credits
                                impute_unresolved=True,
                            ),
                            live_callback=_live_sink,
                        )

                        t_enrich_end = time.perf_counter()
                        enrich_secs = t_enrich_end - t_enrich_start

                        _log_session_block("enrichment_output", enrich_output)
                        _log_session_event(
                            f"enrichment_done "
                            f"checked={enrichment.examined_cells} "
                            f"nav={enrichment.nav_cells} "
                            f"web={enrichment.web_cells} "
                            f"imputed={enrichment.imputed_cells} "
                            f"skipped={enrichment.skipped_cells} "
                            f"codes={enrichment.resolved_codes} "
                            f"secs={enrich_secs:.1f}"
                        )

                        st.write(
                            f"  ✅ Enrichment done in **{enrich_secs:.0f}s** — "
                            f"checked {enrichment.examined_cells} cells, "
                            f"NAV filled {enrichment.nav_cells}, "
                            f"web filled {enrichment.web_cells}, "
                            f"imputed {enrichment.imputed_cells}"
                        )

                        # Scoring consumes the enriched CSV, not the upload.
                        pipeline_input_path = Path(enrichment.enriched_csv_path)

                        # ── Phase 2: Scoring + Excel ───────────────────────
                        st.write("**2/2 Scoring engine** — computing scores, ranking, generating Excel…")
                        t_engine_start = time.perf_counter()

                        funds, engine_output = _run_with_captured_output(
                            run_data_engine,
                            csv_path=str(pipeline_input_path),
                            output_path=str(generated_path),
                            use_comprehensive_scoring=True,
                            live_callback=_live_sink,
                        )

                        t_engine_end = time.perf_counter()
                        engine_secs = t_engine_end - t_engine_start
                        total_secs = time.perf_counter() - t_total_start

                        _log_session_block("engine_output", engine_output)
                        _log_session_event(
                            f"engine_done funds={len(funds)} "
                            f"secs={engine_secs:.1f} total={total_secs:.1f}"
                        )

                        st.write(
                            f"  ✅ Scoring done in **{engine_secs:.0f}s** — "
                            f"{len(funds)} funds scored"
                        )

                        status.update(
                            label=f"✅ Complete — {total_secs:.0f}s total",
                            state="complete",
                            expanded=False,
                        )

                except Exception as exc:
                    # Any pipeline failure: log full traceback, show it, abort render.
                    err_text = "".join(traceback.format_exception(exc))
                    _log_session_block("run_failure", err_text)
                    _log_session_event(f"run_failed error={exc}")
                    st.error("Run failed. See terminal for traceback.")
                    st.code(err_text, language="text")
                    return

                # ── Summary ────────────────────────────────────────────────
                if enrichment.errors:
                    st.warning("Enrichment completed with warnings — check scratchpad for details.")
                    if enrichment.scratchpad_path:
                        st.caption(f"Scratchpad: `{enrichment.scratchpad_path}`")

                enrichment_summary = (
                    f"Enrichment: {enrichment.examined_cells} cells checked — "
                    f"NAV filled {enrichment.nav_cells}, "
                    f"web filled {enrichment.web_cells}, "
                    f"imputed {enrichment.imputed_cells}, "
                    f"skipped {enrichment.skipped_cells}."
                )

                timing_html = (
                    f'<div class="mf-timing">'
                    f'⏱ Enrichment: <strong>{enrich_secs:.0f}s</strong> &nbsp;|&nbsp; '
                    f'Scoring: <strong>{engine_secs:.0f}s</strong> &nbsp;|&nbsp; '
                    f'Total: <strong>{total_secs:.0f}s ({total_secs/60:.1f} min)</strong>'
                    f"{'&nbsp; 🎯 Under 3 min!' if total_secs < 180 else ''}"
                    f'</div>'
                )

                # Read the workbook into memory so the download button does
                # not depend on the file staying on disk.
                with generated_path.open("rb") as f:
                    generated_bytes = io.BytesIO(f.read())
                generated_filename = generated_path.name
                funds_count = len(funds)
                categories_count = len({f.category for f in funds})

                st.success("Step 3 · Excel ready — download below.")
                if enrichment_summary:
                    st.info(enrichment_summary)

        # ── Download area (persists after rerun) ──────────────────────────
        # NOTE(review): see the state note above — with plain locals this
        # section only appears during the run that generated the file.
        if generated_bytes and generated_filename:

            if timing_html:
                st.markdown(timing_html, unsafe_allow_html=True)

            st.markdown(
                """
                <div class="mf-metrics">
                  <div class="mf-metric">
                    <div class="mf-metric-label">Schemes scored</div>
                    <div class="mf-metric-value">{funds_count}</div>
                  </div>
                  <div class="mf-metric">
                    <div class="mf-metric-label">Categories</div>
                    <div class="mf-metric-value">{categories_count}</div>
                  </div>
                  <div class="mf-metric">
                    <div class="mf-metric-label">Output format</div>
                    <div class="mf-metric-value">Excel (.xlsx)</div>
                  </div>
                </div>
                """.format(
                    funds_count=funds_count or 0,
                    categories_count=categories_count or 0,
                ),
                unsafe_allow_html=True,
            )

            st.markdown(
                '<div class="mf-download-label">Download the scored workbook:</div>',
                unsafe_allow_html=True,
            )
            st.download_button(
                label="⬇️ Download processed Excel",
                data=generated_bytes.getvalue(),
                file_name=generated_filename,
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                use_container_width=True,
            )

    with tab_about:
        st.markdown("### What the pipeline does")
        st.markdown(
            """
            | Phase | What happens |
            |---|---|
            | **0 — Scheme resolution** | Parallel fuzzy-match of missing AMFI scheme codes (8 threads) |
            | **1 — NAV engine** | Trailing 3Y risk metrics computed from mfapi NAV history (12 threads) |
            | **2 — PE/PB web search** | Tavily (primary) or Firecrawl (fallback) — only for missing P/E and P/B |
            | **3 — Median impute** | Category median fills remaining gaps for young/NA funds |
            | **4 — Scoring** | Top/Bottom 10 per category, 10-point weighted model |
            | **5 — Excel export** | Conditional formatting, quartile bands, benchmark rows |

            **Cache**: NAV history is cached in Neon (production) or SQLite (local) with a 7-day TTL.
            Second runs are near-instant for cached funds.
            """
        )

    st.markdown("</section>", unsafe_allow_html=True)
    st.markdown("</div>", unsafe_allow_html=True)
506
+
507
+
508
# Script entry point: render the app when executed directly (streamlit run app.py).
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas>=2.0.0
2
+ openpyxl>=3.1.0
3
+ reportlab>=4.0.0
4
+ matplotlib>=3.7.0
5
+ numpy>=1.24.0
6
+ click>=8.1.0
7
+ streamlit>=1.31.0
8
+ requests>=2.31.0
9
+ python-dateutil>=2.8.2
10
+ fuzzywuzzy>=0.18.0
11
+ python-Levenshtein>=0.21.0
12
+ mftool>=1.0.0
13
+ yfinance>=1.2.0
14
+ beautifulsoup4>=4.14.3
15
+ scipy>=1.17.1
16
+ lxml>=6.0.2
17
+ openai>=1.0.0
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Mutual Fund Portfolio Analyzer
src/charts.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Charts module: generates matplotlib charts for embedded use in PDF reports.
3
+ All functions return a BytesIO buffer containing a PNG image.
4
+ """
5
+
6
+ import io
7
+ import numpy as np
8
+ import matplotlib
9
+ matplotlib.use('Agg') # non-interactive backend
10
+ import matplotlib.pyplot as plt
11
+ import matplotlib.patches as mpatches
12
+ from matplotlib.figure import Figure
13
+ from typing import Dict, List, Optional
14
+
15
+ # Brand colours
16
+ BRAND_BLUE = "#1F3864"
17
+ BRAND_ACCENT = "#2E75B6"
18
+ GREENS = ["#2ECC71", "#27AE60", "#1ABC9C", "#16A085", "#52BE80"]
19
+ REDS = ["#E74C3C", "#C0392B", "#EC7063"]
20
+ PALETTE = [
21
+ "#2E75B6", "#E67E22", "#2ECC71", "#E74C3C", "#9B59B6",
22
+ "#1ABC9C", "#F39C12", "#3498DB", "#D35400", "#27AE60",
23
+ ]
24
+
25
+
26
def _buf(fig: Figure) -> io.BytesIO:
    """Serialize *fig* to a PNG buffer (rewound to the start) and close the figure."""
    png = io.BytesIO()
    fig.savefig(png, format='png', bbox_inches='tight', dpi=150)
    png.seek(0)
    # Close eagerly so batch report generation doesn't accumulate figures.
    plt.close(fig)
    return png
32
+
33
+
34
def holdings_pie_chart(holdings_data: Dict[str, float], title: str = "Portfolio Allocation") -> io.BytesIO:
    """
    Pie chart of holdings by name → value.
    holdings_data: {scheme_name: current_value}
    """
    names = list(holdings_data)
    amounts = [holdings_data[name] for name in names]

    # Legend shows a trimmed form of each scheme name (prefix before '-', max 22 chars).
    legend_labels = [name.split('-')[0].strip()[:22] for name in names]

    fig, ax = plt.subplots(figsize=(5, 4))
    wedges, texts, autotexts = ax.pie(
        amounts,
        labels=None,
        autopct='%1.1f%%',
        startangle=140,
        colors=PALETTE[:len(amounts)],
        pctdistance=0.78,
    )
    for pct_text in autotexts:
        pct_text.set_fontsize(7)
        pct_text.set_color("white")

    ax.legend(wedges, legend_labels, loc="center left", bbox_to_anchor=(1, 0.5),
              fontsize=7, frameon=False)
    ax.set_title(title, fontsize=10, fontweight='bold', color=BRAND_BLUE, pad=10)
    fig.tight_layout()
    return _buf(fig)
63
+
64
+
65
def sector_bar_chart(sector_data: Dict[str, float], title: str = "Sector Allocation (%)") -> io.BytesIO:
    """Horizontal bar chart for sector allocation."""
    if not sector_data:
        # Placeholder bar when the source had no sector breakdown.
        sector_data = {"Data Not Available": 100}

    # Largest allocation first; (value, name) tuples keep tie-breaking deterministic.
    ranked = sorted(zip(sector_data.values(), sector_data.keys()), reverse=True)
    values, sectors = zip(*ranked)

    fig, ax = plt.subplots(figsize=(5, max(3, len(sectors) * 0.35)))
    bars = ax.barh(sectors, values, color=BRAND_ACCENT, edgecolor='white', height=0.6)

    # Annotate each bar with its percentage just past the bar end.
    for rect, pct in zip(bars, values):
        ax.text(rect.get_width() + 0.3, rect.get_y() + rect.get_height() / 2,
                f'{pct:.1f}%', va='center', fontsize=7, color='black')

    ax.set_xlabel("Allocation (%)", fontsize=8, color='gray')
    ax.set_title(title, fontsize=10, fontweight='bold', color=BRAND_BLUE)
    ax.set_xlim(0, max(values) * 1.2)
    ax.invert_yaxis()
    ax.spines[['top', 'right']].set_visible(False)
    ax.tick_params(axis='y', labelsize=8)
    fig.tight_layout()
    return _buf(fig)
92
+
93
+
94
def market_cap_pie(market_cap_data: Dict[str, float]) -> io.BytesIO:
    """Pie chart for Large/Mid/Small/Other market cap split.

    Buckets with a non-positive share are dropped. If nothing remains
    (all zeros or missing data), a neutral "No Data" wedge is drawn
    instead of attempting an empty pie, which would fail/render blank.
    """
    default = {"Large Cap": 0, "Mid Cap": 0, "Small Cap": 0, "Others": 0}
    data = {**default, **market_cap_data}
    data = {k: v for k, v in data.items() if v > 0}
    if not data:
        # Robustness fix: every bucket was zero/absent — show a placeholder.
        data = {"No Data": 100}

    colors = {"Large Cap": "#2E75B6", "Mid Cap": "#E67E22",
              "Small Cap": "#2ECC71", "Others": "#BDC3C7"}

    labels = list(data.keys())
    values = list(data.values())
    # Unknown labels (e.g. the placeholder) fall back to a neutral grey.
    clrs = [colors.get(l, "#95A5A6") for l in labels]

    fig, ax = plt.subplots(figsize=(4, 3.5))
    wedges, _, autotexts = ax.pie(
        values, labels=None, autopct='%1.1f%%',
        colors=clrs, startangle=90, pctdistance=0.75
    )
    for at in autotexts:
        at.set_fontsize(8)
        at.set_color("white")

    ax.legend(wedges, labels, loc="lower center", bbox_to_anchor=(0.5, -0.12),
              ncol=2, fontsize=8, frameon=False)
    ax.set_title("Market Cap Allocation", fontsize=10, fontweight='bold', color=BRAND_BLUE)
    fig.tight_layout()
    return _buf(fig)
121
+
122
+
123
def holding_vs_benchmark_chart(
    fund_name: str,
    cagr_data: Dict[str, Dict[str, Optional[float]]],
) -> io.BytesIO:
    """
    Bar chart comparing fund CAGR vs benchmark across time periods.
    cagr_data = {
        '1Y': {'fund': 12.5, 'benchmark': 14.6, 'category': 13.4},
        '3Y': {...}, '5Y': {...}, '10Y': {...}
    }
    """
    periods = list(cagr_data)

    def _series(key: str) -> list:
        # Missing values (None) render as zero-height bars.
        return [cagr_data[p].get(key) or 0 for p in periods]

    fund_vals = _series('fund')
    bm_vals = _series('benchmark')
    cat_vals = _series('category')

    x = np.arange(len(periods))
    width = 0.25

    fig, ax = plt.subplots(figsize=(5, 3.5))
    bar_groups = [
        ax.bar(x - width, fund_vals, width, label='Fund', color=BRAND_ACCENT, zorder=2),
        ax.bar(x, bm_vals, width, label='Benchmark', color='#E67E22', zorder=2),
        ax.bar(x + width, cat_vals, width, label='Category', color='#BDC3C7', zorder=2),
    ]

    # Value labels above each non-zero bar.
    for bars in bar_groups:
        for rect in bars:
            height = rect.get_height()
            if height:
                ax.text(rect.get_x() + rect.get_width() / 2, height + 0.2,
                        f'{height:.1f}', ha='center', va='bottom', fontsize=6.5)

    ax.set_xticks(x)
    ax.set_xticklabels(periods, fontsize=9)
    ax.set_ylabel("CAGR (%)", fontsize=8, color='gray')
    ax.set_title(f"{fund_name[:30]}\nReturns vs Benchmark", fontsize=9, fontweight='bold', color=BRAND_BLUE)
    ax.legend(fontsize=7, frameon=False)
    ax.spines[['top', 'right']].set_visible(False)
    ax.yaxis.grid(True, linestyle='--', alpha=0.5, zorder=0)
    ax.set_axisbelow(True)
    fig.tight_layout()
    return _buf(fig)
166
+
167
+
168
def quartile_analysis_grid(holdings_data: list) -> io.BytesIO:
    """
    Quartile Analysis Grid — based on the senior's handwritten sketch.

    Layout (matching sketch exactly):
      Columns : 1Y | 3Y | 5Y | 10Y
      For each holding, show 3 rows:
        BM    : Benchmark CAGR value for each period
        Cat   : Category Average CAGR for each period
        Scheme: Fund CAGR + Quartile (Q1/Q2/Q3/Q4) — color-coded

    holdings_data: list of dicts, each with keys:
      scheme_name, rank_in_category, total_in_category,
      cagr_1y/_bm/_cat, cagr_3y/_bm/_cat, cagr_5y/_bm/_cat, cagr_10y/_bm/_cat
    """
    PERIODS = ["1Y", "3Y", "5Y", "10Y"]
    PERIOD_KEYS = ["1y", "3y", "5y", "10y"]
    ROW_LABELS = ["BM", "Cat", "Scheme"]

    # Quartile fill colours (Q1 best → Q4 worst) plus fixed tints for BM/Cat rows.
    Q_COLORS = {1: "#90EE90", 2: "#BDD7EE", 3: "#FFD580", 4: "#FFB3B3"}
    HEADER_CLR = "#1F3864"
    BM_CLR = "#D6E4F0"
    CAT_CLR = "#EBF5FB"

    def get_quartile(rank, total):
        # Missing rank data is treated as bottom quartile.
        if not rank or not total or total == 0:
            return 4
        pct = rank / total
        if pct <= 0.25: return 1
        if pct <= 0.50: return 2
        if pct <= 0.75: return 3
        return 4

    def fmt(v):
        # Format a CAGR value as "x.x%", en-dash for missing.
        # NOTE(review): bare except silently maps any non-numeric value to the
        # dash — consider narrowing to (TypeError, ValueError).
        if v is None: return "–"
        try: return f"{float(v):.1f}%"
        except: return "–"

    n_holdings = len(holdings_data)
    rows_per = 3  # BM, Cat, Scheme
    n_rows = n_holdings * rows_per + 1  # +1 for header row
    n_cols = 5  # Label + 4 periods

    fig_h = max(4.5, 0.5 * n_rows + 1.5)
    fig, ax = plt.subplots(figsize=(10, fig_h))
    ax.set_xlim(0, n_cols)
    ax.set_ylim(0, n_rows)
    ax.axis('off')

    def cell(row, col, text, bg, tc="#1F3864", bold=False, fs=8):
        # Draw one table cell: filled unit rectangle + centred text.
        # Row 0 is the top of the table, so y is flipped (n_rows - row - 1).
        ax.add_patch(plt.Rectangle(
            (col, n_rows - row - 1), 1, 1,
            facecolor=bg, edgecolor="#AAAAAA", linewidth=0.5, zorder=1))
        ax.text(col + 0.5, n_rows - row - 0.5, text,
                ha='center', va='center', fontsize=fs,
                fontweight='bold' if bold else 'normal',
                color=tc, zorder=2, wrap=True)

    # Column header row
    col_widths = [1.5, 1, 1, 1, 0.8]  # proportional, but we draw on a 5-unit grid
    cell(0, 0, "Scheme / Row", HEADER_CLR, "white", bold=True, fs=7.5)
    for ci, p in enumerate(PERIODS, 1):
        cell(0, ci, p, HEADER_CLR, "white", bold=True, fs=10)

    # Data rows: 3 rows (BM / Category / Scheme) per holding.
    cur = 1
    for h in holdings_data:
        rank = h.get("rank_in_category")
        total = h.get("total_in_category")
        q = get_quartile(rank, total)
        qc = Q_COLORS[q]
        q_lbl = f"Q{q}"
        name = str(h.get("scheme_name", ""))[:22]

        for ri, rl in enumerate(ROW_LABELS):
            # First column: row label (scheme name shown only on the BM row).
            if ri == 0:
                lbl = f"{name}\n[BM]"
                bg = BM_CLR
            elif ri == 1:
                lbl = "[Category]"
                bg = CAT_CLR
            else:
                lbl = f"[Scheme — {q_lbl}]"
                bg = qc

            cell(cur + ri, 0, lbl, bg, bold=(ri == 2), fs=6.5)

            # Period columns: BM / Cat / Scheme value for each horizon.
            for ci, pk in enumerate(PERIOD_KEYS, 1):
                if ri == 0:
                    v = fmt(h.get(f"cagr_{pk}_bm"))
                    bg_c = BM_CLR
                elif ri == 1:
                    v = fmt(h.get(f"cagr_{pk}_cat"))
                    bg_c = CAT_CLR
                else:
                    fv = h.get(f"cagr_{pk}")
                    bmv = h.get(f"cagr_{pk}_bm")
                    v = fmt(fv)
                    bg_c = qc
                    # Green tick if fund beats benchmark this period
                    if fv is not None and bmv is not None and float(fv) >= float(bmv):
                        ax.text(ci + 0.88, n_rows - (cur + ri) - 0.18,
                                "✓", fontsize=8, color="#006400", va='center', zorder=3)

                cell(cur + ri, ci, v, bg_c, bold=(ri == 2), fs=8)

        # Divider between schemes
        y = n_rows - (cur + rows_per) - 0.02
        ax.axhline(y=y, xmin=0, xmax=1, color="#555555", linewidth=1.0, zorder=4)
        cur += rows_per

    # Legend: one patch per quartile colour.
    patches = [mpatches.Patch(facecolor=Q_COLORS[i], edgecolor='#AAAAAA',
                              label=f"Q{i} – {['Top Quartile','Above Avg','Below Avg','Bottom Quartile'][i-1]}")
               for i in range(1, 5)]
    ax.legend(handles=patches, loc='lower center',
              bbox_to_anchor=(0.5, -0.09), ncol=4, fontsize=7.5, frameon=False)

    ax.set_title("Quartile Analysis — Scheme vs Benchmark & Category Average",
                 fontsize=10, fontweight='bold', color=HEADER_CLR, pad=10)
    fig.tight_layout()
    return _buf(fig)
290
+
291
+
292
def wealth_projection_chart(projection: Dict[int, float], current_value: float) -> io.BytesIO:
    """Line chart showing projected wealth growth at 12% over years."""
    # Prepend "now" (year 0 at the current value) to the projected series.
    years = [0, *projection.keys()]
    values = [current_value, *projection.values()]

    fig, ax = plt.subplots(figsize=(5, 3))
    ax.plot(years, values, marker='o', color=BRAND_ACCENT, linewidth=2, markersize=6)

    # Label each point in lakhs (₹ x.xL), nudged above the marker.
    for year, amount in zip(years, values):
        ax.annotate(f'₹{amount/1e5:.1f}L', (year, amount),
                    textcoords="offset points", xytext=(0, 8),
                    ha='center', fontsize=7.5, color=BRAND_BLUE)

    ax.fill_between(years, values, alpha=0.15, color=BRAND_ACCENT)
    ax.set_xticks(years)
    ax.set_xticklabels(['Now' if year == 0 else f'{year}Y' for year in years], fontsize=8)
    ax.set_ylabel("Portfolio Value (₹)", fontsize=8, color='gray')
    ax.set_title("Wealth Projection @ 12% p.a.", fontsize=10, fontweight='bold', color=BRAND_BLUE)
    ax.spines[['top', 'right']].set_visible(False)
    ax.yaxis.grid(True, linestyle='--', alpha=0.4)
    ax.set_axisbelow(True)
    fig.tight_layout()
    return _buf(fig)
src/csv_enrichment.py ADDED
@@ -0,0 +1,941 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CSV Enrichment β€” missing-cell filler for mutual fund statistics.
2
+
3
+ Fill pipeline (in order):
4
+ 0. SCHEME CODE RESOLUTION β€” fuzzy-match missing scheme codes via mfapi
5
+ 1. TRIAGE β€” classify every missing cell
6
+ 2. NAV ENGINE β€” compute trailing-3Y metrics from NAV history
7
+ 3. WEB SEARCH (Firecrawl) β€” scrape trusted sites for remaining gaps
8
+ 4. CATEGORY MEDIAN β€” last-resort statistical imputation
9
+
10
+ Fixes vs original:
11
+ β€’ Phase-label typo in log (Phase 4 β†’ Phase 5 for imputation step)
12
+ β€’ Unknown launch date β†’ is_young = False (attempt search, don't silently impute)
13
+ β€’ _normalize_fund_name uses re.sub to handle multi-hyphen sequences
14
+ β€’ scheme code resolution runs BEFORE triage so NAV engine fires for more funds
15
+ β€’ Standard Deviation now included in NAV-computable metrics
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import csv
21
+ import os
22
+ import re
23
+ from dataclasses import dataclass, field
24
+ from datetime import datetime
25
+ from pathlib import Path
26
+ from statistics import median
27
+ from typing import Any
28
+
29
+ import requests
30
+
31
+ from src.nav_metrics_engine import NavEngineCache, compute_nav_metrics_for_scheme
32
+ from src.scheme_resolver import (
33
+ resolve_missing_scheme_codes,
34
+ resolve_scheme_code_for_fund_name,
35
+ )
36
+
37
# Fund names attempted by NAV engine this session (process-wide; guards
# against re-running the expensive NAV computation for the same fund).
_NAV_ATTEMPTED_FUNDS: set[str] = set()


# ── Constants ────────────────────────────────────────────────────────────────

# Cell values (after strip + lower) that count as "missing".
MISSING_TOKENS = {"", "-", "na", "n/a", "n/a*", "nan", "none", "null", "—"}

# CSV columns the enrichment pipeline attempts to fill.
TARGET_COLUMNS = (
    "Alpha",
    "Beta",
    "Standard Deviation",
    "Sharpe Ratio",
    "Volatility",
    "Mean",
    "Sortino Ratio",
    "Up Market Capture\nRatio",
    "Down Market Capture\nRatio",
    "Maximum Drawdown",
    "R-Squared",
    "Information Ratio",
    "P/E Ratio",
    "P/B Ratio",
)

# For all of these risk/ratio metrics, a literal numeric 0 is usually a
# data-quality gap rather than a meaningful "zero risk" value. We therefore
# treat 0 as missing so that enrichment (NAV engine + web search) can attempt
# to backfill real numbers.
ZERO_AS_MISSING_COLUMNS = set(TARGET_COLUMNS)

# ALL metrics that are equity-specific and should NOT be attempted
# via NAV engine or web search for debt/liquid/overnight funds.
# Sharpe, Sortino, Volatility etc. ARE computed by NAV engine for equity
# but for debt funds they either don't exist or are meaningless.
EQUITY_ONLY_METRICS = {
    "Alpha",
    "Beta",
    "Standard Deviation",
    "Sharpe Ratio",
    "Volatility",
    "Mean",
    "Sortino Ratio",
    "Up Market Capture\nRatio",
    "Down Market Capture\nRatio",
    "Maximum Drawdown",
    "R-Squared",
    "Information Ratio",
    "P/E Ratio",
    "P/B Ratio",
}

# Category-name prefixes identifying debt-like funds (matched case-insensitively
# by _is_debt_category).
DEBT_CATEGORIES_PREFIXES = (
    "Debt:", "Liquid", "Overnight", "Money Market", "Gilt",
    "Fixed Maturity", "Interval Fund", "FMP",
)

# Minimum fund age (years) before trailing risk metrics are considered computable.
MIN_YEARS_FOR_RISK_METRICS = 3

# Domains whose scraped content is preferred when ranking search results.
TRUSTED_DOMAINS = (
    "valueresearchonline.com",
    "morningstar.in",
    "moneycontrol.com",
    "advisorkhoj.com",
    "amfiindia.com",
    "tickertape.in",
)

# Per-metric row labels to look for when scanning scraped markdown tables;
# tried in order, first alias that yields a numeric cell wins.
METRIC_ALIASES: dict[str, list[str]] = {
    "Alpha": ["alpha"],
    "Beta": ["beta"],
    "Standard Deviation": ["standard deviation", "std dev", "std. dev"],
    "Sharpe Ratio": ["sharpe ratio", "sharpe"],
    "Volatility": ["volatility"],
    "Mean": ["mean", "mean return"],
    "Sortino Ratio": ["sortino ratio", "sortino"],
    "Up Market Capture\nRatio": ["upside", "up market capture", "upside capture", "up capture"],
    "Down Market Capture\nRatio": ["downside", "down market capture", "downside capture", "down capture"],
    "Maximum Drawdown": ["maximum drawdown", "max drawdown", "maximum"],
    "R-Squared": ["r-squared", "r squared", "r2", "r²"],
    "Information Ratio": ["information ratio", "info ratio"],
    "P/E Ratio": ["p/e ratio", "p/e", "pe ratio", "pe"],
    "P/B Ratio": ["p/b ratio", "p/b", "pb ratio", "pb"],
}
121
+
122
+
123
+ # ── Config & Result ──────────────────────────────────────────────────────────
124
+
125
@dataclass
class EnrichmentConfig:
    """Tunable settings for the enrichment pipeline (see ``enrich_csv``)."""

    enabled: bool = True  # master switch; False makes enrich_csv a no-op
    max_cells: int | None = None  # cap on SEARCHABLE cells processed (None = all)
    min_confidence: float = 0.65  # confidence threshold (usage not shown in this chunk — TODO confirm)
    search_limit: int = 5  # max results requested per web-search query
    impute_unresolved: bool = True  # fall back to category medians when no value found
    filter_category: str | None = None  # restrict triage to a single category
    target_columns: tuple[str, ...] = TARGET_COLUMNS  # columns to fill
    trusted_domains: tuple[str, ...] = TRUSTED_DOMAINS  # preferred scrape domains
    enable_nav_engine: bool = True  # compute metrics from NAV history first
    resolve_scheme_codes: bool = True  # run pre-triage code resolution
    web_search_pe_pb_only: bool = False  # limit web search to P/E and P/B only
139
+
140
@dataclass
class EnrichmentResult:
    """Summary of one ``enrich_csv`` run: file paths plus per-phase counters."""

    input_csv_path: str
    enriched_csv_path: str
    scratchpad_path: str | None = None
    examined_cells: int = 0  # missing cells found by triage
    updated_cells: int = 0  # cells filled with a real (NAV/web) value
    imputed_cells: int = 0  # cells filled with a category median
    skipped_cells: int = 0  # cells left empty (no value, no median)
    resolved_codes: int = 0  # NEW: how many scheme codes were resolved
    # Optional breakdowns used by older callers / UIs
    nav_cells: int = 0  # cells filled via NAV engine
    web_cells: int = 0  # cells filled via web search
    errors: list[str] = field(default_factory=list)
154
+
155
+
156
+ # ── Triage labels ────────────────────────────────────────────────────────────
157
+
158
# Triage classification labels (assigned by _triage_missing_cells).
TRIAGE_YOUNG = "YOUNG_FUND"  # fund has <3y history; metric cannot exist yet
TRIAGE_NOT_APPLICABLE = "NOT_APPLICABLE"  # not produced by _triage_missing_cells in this version
TRIAGE_SEARCHABLE = "SEARCHABLE"  # metric should exist; attempt NAV engine / web search
161
+
162
+
163
+ # ── Helpers ──────────────────────────────────────────────────────────────────
164
+
165
def _load_env() -> None:
    """Populate os.environ from the project-root ``.env`` file, if present.

    Existing environment variables are never overwritten (``setdefault``).
    Blank lines, comments, and lines without ``KEY=VALUE`` are ignored.
    """
    dotenv = Path(__file__).resolve().parent.parent / ".env"
    if not dotenv.exists():
        return
    for entry in dotenv.read_text(encoding="utf-8").splitlines():
        stripped = entry.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        key, _, value = stripped.partition("=")
        os.environ.setdefault(key.strip(), value.strip())
175
+
176
+
177
def _is_missing(val: str | None) -> bool:
    """True when *val* (None-safe) normalizes to a known missing-token."""
    normalized = "" if val is None else val
    return normalized.strip().lower() in MISSING_TOKENS
179
+
180
+
181
+ def _parse_launch_date(val: str | None) -> datetime | None:
182
+ if not val:
183
+ return None
184
+ for fmt in ("%d-%m-%Y", "%Y-%m-%d", "%d/%m/%Y"):
185
+ try:
186
+ return datetime.strptime(val.strip(), fmt)
187
+ except ValueError:
188
+ continue
189
+ return None
190
+
191
+
192
def _is_debt_category(category: str) -> bool:
    """True when *category* begins with any debt-like prefix (case-insensitive)."""
    cat = (category or "").strip().lower()
    return any(cat.startswith(prefix.lower()) for prefix in DEBT_CATEGORIES_PREFIXES)
198
+
199
+
200
+ def _normalize_fund_name(name: str) -> str:
201
+ # FIX: use re.sub so multi-hyphen runs collapse to a single space
202
+ return re.sub(r"-+", " ", name).strip()
203
+
204
+
205
def _build_category_medians(
    rows: list[dict[str, str]], columns: tuple[str, ...]
) -> dict[str, dict[str, float]]:
    """Compute per-category medians for each target column.

    Cell text is cleaned of '%' and thousands separators; missing tokens and
    non-numeric values are ignored. Returns ``{category: {column: median}}``
    with medians rounded to 4 decimals; a column with no numeric samples is
    omitted from its category's map, and rows without a Category are skipped.
    """
    samples: dict[str, dict[str, list[float]]] = {}
    for record in rows:
        category = record.get("Category", "")
        if not category:
            continue
        per_col = samples.setdefault(category, {c: [] for c in columns})
        for column in columns:
            cleaned = (record.get(column) or "").strip().replace("%", "").replace(",", "")
            if cleaned.lower() in MISSING_TOKENS:
                continue
            try:
                numeric = float(cleaned)
            except ValueError:
                continue
            per_col[column].append(numeric)

    return {
        category: {
            column: round(median(values), 4)
            for column, values in per_col.items()
            if values
        }
        for category, per_col in samples.items()
    }
232
+
233
+
234
+ # ── Triage ───────────────────────────────────────────────────────────────────
235
+
236
@dataclass
class TriagedCell:
    """One missing CSV cell together with the triage decision made for it."""

    row_idx: int  # index into the parsed rows list
    fund_name: str
    category: str
    column: str  # the target column that is missing
    current_value: str  # raw cell content at triage time ("", "-", "0", …)
    label: str  # one of the TRIAGE_* constants
    reason: str  # human-readable explanation, written to the scratchpad
245
+
246
+
247
def _triage_missing_cells(
    rows: list[dict[str, str]],
    config: EnrichmentConfig,
) -> list[TriagedCell]:
    """Classify every missing cell with reasoning.

    A cell is "missing" when its value is in MISSING_TOKENS or — for
    ZERO_AS_MISSING_COLUMNS — parses to exactly 0. Each missing cell is
    labelled TRIAGE_YOUNG (fund too young for the metric to exist yet) or
    TRIAGE_SEARCHABLE (metric should exist; try NAV engine / web search).
    """
    today = datetime.now()
    cells: list[TriagedCell] = []

    for idx, row in enumerate(rows):
        fund = row.get("Fund", "")
        cat = row.get("Category", "")
        launch_str = row.get("Launch Date", "")

        # Optional category filter: skip rows outside the requested category.
        if config.filter_category and cat != config.filter_category:
            continue

        launch_dt = _parse_launch_date(launch_str)

        if launch_dt is not None:
            age_years = (today - launch_dt).days / 365.25
            is_young = age_years < MIN_YEARS_FOR_RISK_METRICS
        else:
            # FIX: unknown date → do NOT silently mark as young; attempt search
            is_young = False

        # NOTE(review): computed but not used below in this function —
        # confirm downstream use before removing.
        is_debt = _is_debt_category(cat)

        for col in config.target_columns:
            raw = (row.get(col) or "").strip()

            # Base missing check (blank, "-", "N/A", etc.)
            is_missing_val = _is_missing(raw)

            # Additionally, for all ZERO_AS_MISSING_COLUMNS, treat an exact
            # numeric 0 as "missing" so enrichment will try to fill it.
            if not is_missing_val and col in ZERO_AS_MISSING_COLUMNS:
                norm = raw.replace("%", "").replace(",", "").strip()
                try:
                    if float(norm) == 0.0:
                        is_missing_val = True
                except ValueError:
                    pass

            if not is_missing_val:
                continue

            if is_young:
                cells.append(TriagedCell(
                    row_idx=idx, fund_name=fund, category=cat, column=col,
                    current_value=raw, label=TRIAGE_YOUNG,
                    reason=(f"Fund launched {launch_str or '(unknown)'}, "
                            f"<{MIN_YEARS_FOR_RISK_METRICS}yr history — metric not computed yet"),
                ))
            else:
                cells.append(TriagedCell(
                    row_idx=idx, fund_name=fund, category=cat, column=col,
                    current_value=raw, label=TRIAGE_SEARCHABLE,
                    reason=(f"Fund launched {launch_str or '(unknown date)'}, "
                            f"category '{cat}' — metric should exist, attempting NAV/web"),
                ))

    return cells
309
+
310
+
311
+ # ── Markdown table parser ────────────────────────────────────────────────────
312
+
313
def _extract_number(text: str) -> float | None:
    """Pull the first signed decimal number out of *text*, or None.

    Commas are stripped first; missing-token cells yield None.
    """
    cleaned = text.strip().replace(",", "")
    if cleaned.lower() in MISSING_TOKENS or cleaned == "—":
        return None
    hit = re.search(r"-?\d+\.?\d*", cleaned)
    if hit is None:
        return None
    try:
        return float(hit.group())
    except ValueError:
        return None
324
+
325
+
326
def _parse_table_row(markdown: str, alias: str) -> float | None:
    """Extract the first numeric cell after *alias* in a markdown table row.

    For a row like ``| Alpha | 1.59 | -0.56 | 8.25 |`` this returns 1.59 —
    the leftmost numeric value after the label. That is intentional: sites
    like Morningstar show Fund | Category | Index and we want the fund
    value, not the category or index value.
    """
    row_re = re.compile(
        r"\|\s*" + re.escape(alias) + r"\s*\|(.+?)(?:\n|$)",
        re.IGNORECASE,
    )
    for hit in row_re.finditer(markdown):
        for piece in hit.group(1).split("|"):
            number = _extract_number(piece.strip())
            if number is not None:
                return number
    return None
346
+
347
+
348
def _parse_metrics_from_markdown(
    markdown: str, wanted_metrics: list[str]
) -> dict[str, float | None]:
    """Scan scraped markdown for each wanted metric via its known aliases.

    Returns ``{metric: value-or-None}``. For each metric, the first alias
    that both appears in the text and yields a numeric table cell wins.
    """
    # PERF: hoist the lowercase conversion out of the alias loop —
    # *markdown* is the concatenation of several scraped pages and was
    # previously re-lowered once per alias per metric.
    markdown_lower = markdown.lower()
    found: dict[str, float | None] = {}
    for metric in wanted_metrics:
        aliases = METRIC_ALIASES.get(metric, [metric.lower()])
        best_val: float | None = None
        for alias in aliases:
            if alias.lower() not in markdown_lower:
                continue
            val = _parse_table_row(markdown, alias)
            if val is not None:
                best_val = val
                break
        found[metric] = best_val
    return found
364
+
365
+
366
+ # ── Web search (Firecrawl) ───────────────────────────────────────────────────
367
+
368
def _call_tavily_search(query: str, api_key: str, limit: int = 5) -> list[dict]:
    """Search using Tavily API. Returns list of dicts with 'url' and 'markdown' keys."""
    payload = {
        "api_key": api_key,
        "query": query,
        "max_results": limit,
        "include_raw_content": True,
        "search_depth": "advanced",
    }
    try:
        response = requests.post(
            "https://api.tavily.com/search",
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=30,
        )
        response.raise_for_status()
        body = response.json()
        # Prefer the full page text; fall back to the snippet content.
        return [
            {
                "url": item.get("url", ""),
                "markdown": item.get("raw_content") or item.get("content", ""),
            }
            for item in body.get("results", [])
        ]
    except Exception as exc:
        print(f" [tavily] search error: {exc}")
        return []


# Keep firecrawl as alias name so _search_fund_metrics calls work unchanged
_call_firecrawl_search = _call_tavily_search
399
+
400
+
401
def _scrape_url(url: str, api_key: str) -> str:
    """Fetch page content using Tavily extract API.

    Returns the extracted raw content, or "" on any failure or empty result.
    """
    try:
        response = requests.post(
            "https://api.tavily.com/extract",
            headers={"Content-Type": "application/json"},
            json={"api_key": api_key, "urls": [url]},
            timeout=30,
        )
        response.raise_for_status()
        extracted = response.json().get("results", [])
        if extracted:
            return extracted[0].get("raw_content", "")
    except Exception as exc:
        print(f" [tavily extract] error for {url}: {exc}")
    return ""
418
+
419
+
420
+ def _derive_morningstar_risk_url(any_ms_url: str) -> str | None:
421
+ if "morningstar.in/mutualfunds/" not in any_ms_url:
422
+ return None
423
+ for suffix in ("fund-factsheet.aspx", "overview.aspx", "portfolio.aspx",
424
+ "performance.aspx", "detailed-portfolio.aspx"):
425
+ if suffix in any_ms_url:
426
+ return any_ms_url.replace(suffix, "risk-ratings.aspx")
427
+ if "risk-ratings.aspx" in any_ms_url:
428
+ return any_ms_url
429
+ return None
430
+
431
+
432
+ def _derive_morningstar_portfolio_url(any_ms_url: str) -> str | None:
433
+ """Derive the Morningstar portfolio page (for P/E and P/B)."""
434
+ if "morningstar.in/mutualfunds/" not in any_ms_url:
435
+ return None
436
+ return re.sub(
437
+ r"(fund-factsheet|overview|risk-ratings|performance|detailed-portfolio)\.aspx",
438
+ "portfolio.aspx",
439
+ any_ms_url,
440
+ )
441
+
442
+
443
def _search_fund_metrics(
    fund_name: str,
    missing_metrics: list[str],
    config: EnrichmentConfig,
    firecrawl_key: str,
) -> tuple[dict[str, float | None], list[str]]:
    """Search trusted finance sites for a fund's missing metrics.

    Returns ``({metric: value-or-None}, source_urls)``. Strategy: one broad
    search, trusted domains ranked first; optionally scrape the Morningstar
    risk-ratings and portfolio pages; if no markdown was collected, retry
    with a ValueResearch-focused query before giving up.
    """
    from urllib.parse import urlparse

    readable = _normalize_fund_name(fund_name)
    query = f"{readable} risk rating alpha beta sharpe morningstar"
    print(f" [search] query: {query[:80]}")

    results = _call_firecrawl_search(query, firecrawl_key, config.search_limit)
    if not results:
        print(" [search] no results")
        return {m: None for m in missing_metrics}, []

    # Partition results into trusted-domain hits and the rest; use the top 3
    # with trusted sources first.
    trusted, other = [], []
    for r in results:
        url = r.get("url", "")
        domain = urlparse(url).netloc.lower().replace("www.", "")
        (trusted if any(td in domain for td in config.trusted_domains) else other).append(r)
    use = (trusted + other)[:3]

    source_urls = [r.get("url", "") for r in use]
    print(f" [search] using {len(use)} sources: {[urlparse(u).netloc for u in source_urls]}")

    # Concatenate all markdown; table parsing later runs over the combined text.
    combined = ""
    for r in use:
        md = r.get("markdown", "")
        if md:
            combined += f"\n\n--- {r.get('url', '')} ---\n{md}"

    # Morningstar: scrape risk-ratings page if not already in results
    ms_risk_url = None
    for r in use:
        ms_risk_url = _derive_morningstar_risk_url(r.get("url", ""))
        if ms_risk_url:
            break
    if ms_risk_url and "risk-ratings" not in " ".join(source_urls):
        print(f" [scrape] Morningstar risk page: {ms_risk_url}")
        risk_md = _scrape_url(ms_risk_url, firecrawl_key)
        if risk_md:
            combined += f"\n\n--- {ms_risk_url} ---\n{risk_md}"
            source_urls.append(ms_risk_url)

    # Morningstar: scrape portfolio page for P/E and P/B
    pe_pb_needed = {"P/E Ratio", "P/B Ratio"} & set(missing_metrics)
    if pe_pb_needed and ms_risk_url:
        ms_port_url = _derive_morningstar_portfolio_url(ms_risk_url)
        if ms_port_url and ms_port_url not in source_urls:
            print(f" [scrape] Morningstar portfolio page: {ms_port_url}")
            port_md = _scrape_url(ms_port_url, firecrawl_key)
            if port_md:
                combined += f"\n\n--- {ms_port_url} ---\n{port_md}"
                source_urls.append(ms_port_url)

    # If we still have no markdown content, or if later we still miss
    # metrics, we'll do a second pass focused on ValueResearch.
    if not combined.strip():
        print(" [search] no markdown from initial sources; retrying via valueresearchonline…")
        vr_query = f"{readable} {' '.join(missing_metrics)} valueresearchonline"
        vr_results = _call_firecrawl_search(vr_query, firecrawl_key, config.search_limit)
        if vr_results:
            vr_combined = ""
            for r in vr_results:
                url = r.get("url", "")
                domain = urlparse(url).netloc.lower().replace("www.", "")
                # Only ValueResearch pages are accepted on the retry pass.
                if "valueresearchonline.com" not in domain:
                    continue
                md = r.get("markdown", "")
                if md:
                    vr_combined += f"\n\n--- {url} ---\n{md}"
                    source_urls.append(url)
            combined = vr_combined

    if not combined.strip():
        print(" [search] no markdown content after ValueResearch retry")
        return {m: None for m in missing_metrics}, source_urls

    found = _parse_metrics_from_markdown(combined, missing_metrics)
    for m, v in found.items():
        print(f" [parsed] {m} = {v if v is not None else 'NOT FOUND'}")

    return found, source_urls
528
+
529
+
530
+ # ── Scratchpad ───────────────────────────────────────────────────────────────
531
+
532
def _write_scratchpad(
    path: Path,
    triaged: list[TriagedCell],
    resolved_codes: dict[str, str],
    nav_results: dict[str, dict[str, float | None]],
    web_results: dict[str, dict[str, float | None]],
    web_sources: dict[str, list[str]],
    medians_used: list[tuple[str, str, float]],
    nav_filled: list[tuple[str, str, float]],
    web_filled: list[tuple[str, str, float]],
) -> None:
    """Write a human-readable audit log of the enrichment run to *path*.

    Sections: resolved scheme codes, triage summary and per-cell decisions,
    raw NAV-engine and web-search results, and the three fill lists
    (NAV-filled, web-filled, median-imputed). Purely informational — the
    scratchpad is never read back by the pipeline.
    """
    lines = [
        "=" * 70,
        "ENRICHMENT SCRATCHPAD",
        f"Generated: {datetime.now().isoformat()}",
        "=" * 70, "",
    ]

    if resolved_codes:
        lines += ["-" * 70, f"SCHEME CODES RESOLVED ({len(resolved_codes)})", "-" * 70]
        for fund, code in resolved_codes.items():
            lines.append(f" {fund[:60]:60s} → {code}")
        lines.append("")

    # Bucket triaged cells by label for the summary counts.
    young = [c for c in triaged if c.label == TRIAGE_YOUNG]
    na = [c for c in triaged if c.label == TRIAGE_NOT_APPLICABLE]
    searchable = [c for c in triaged if c.label == TRIAGE_SEARCHABLE]

    lines += [
        f"TOTAL MISSING CELLS: {len(triaged)}",
        f" YOUNG_FUND (auto-impute): {len(young)}",
        f" NOT_APPLICABLE (auto-impute): {len(na)}",
        f" SEARCHABLE (nav/web): {len(searchable)}",
        "",
        "-" * 70, "TRIAGE DECISIONS", "-" * 70,
    ]
    for c in triaged:
        lines.append(f" [{c.label:16s}] {c.fund_name} :: {c.column}")
        lines.append(f" Reason: {c.reason}")
    lines.append("")

    if nav_results:
        lines += ["-" * 70, "NAV ENGINE RESULTS (TRAILING 3Y)", "-" * 70]
        for fund_key, metrics in nav_results.items():
            lines.append(f" Fund: {fund_key}")
            for metric, val in metrics.items():
                lines.append(f" {metric}: {'FOUND = ' + str(val) if val is not None else 'NOT_FOUND'}")
        lines.append("")

    if web_results:
        lines += ["-" * 70, "WEB SEARCH RESULTS", "-" * 70]
        for fund_key, metrics in web_results.items():
            lines.append(f" Fund: {fund_key}")
            for s in web_sources.get(fund_key, []):
                lines.append(f" Source: {s}")
            for metric, val in metrics.items():
                lines.append(f" {metric}: {'FOUND = ' + str(val) if val is not None else 'NOT_FOUND'}")
        lines.append("")

    # The three fill lists share one rendering loop.
    for section_label, items in [
        (f"NAV-FILLED VALUES ({len(nav_filled)})", nav_filled),
        (f"WEB-FILLED VALUES ({len(web_filled)})", web_filled),
        (f"CATEGORY-MEDIAN IMPUTED ({len(medians_used)})", medians_used),
    ]:
        if items:
            lines += ["-" * 70, section_label, "-" * 70]
            for fund, col, val in items:
                lines.append(f" {fund} :: {col} = {val}")
            lines.append("")

    lines += ["=" * 70, "END OF SCRATCHPAD", "=" * 70]
    path.write_text("\n".join(lines), encoding="utf-8")
604
+
605
+
606
+ # ── Main entry point ─────────────────────────────────────────────────────────
607
+
608
+ def enrich_csv(
609
+ csv_path: str,
610
+ config: EnrichmentConfig | None = None,
611
+ ) -> EnrichmentResult:
612
+ """Parse CSV β†’ resolve codes β†’ triage β†’ NAV engine β†’ web fallback β†’ median impute β†’ write.
613
+
614
+ (Previously named enrich_csv_with_firecrawl_and_kimi; renamed for clarity.)
615
+ """
616
+ if config is None:
617
+ config = EnrichmentConfig()
618
+
619
+ _load_env()
620
+
621
+ src = Path(csv_path)
622
+ result = EnrichmentResult(input_csv_path=csv_path, enriched_csv_path=csv_path)
623
+
624
+ if not config.enabled or not src.exists():
625
+ return result
626
+
627
+ with open(src, encoding="utf-8-sig", newline="") as f:
628
+ reader = csv.DictReader(f)
629
+ fieldnames = list(reader.fieldnames or [])
630
+ rows = list(reader)
631
+
632
+ if not rows:
633
+ return result
634
+
635
+ # ── Phase 0: Scheme Code Resolution ─────────────────────────────────
636
+ resolved_codes: dict[str, str] = {}
637
+ if config.resolve_scheme_codes:
638
+ print("[enrichment] Phase 0: Resolving missing scheme codes…")
639
+ rows, resolved_codes = resolve_missing_scheme_codes(rows, verbose=True)
640
+ result.resolved_codes = len(resolved_codes)
641
+
642
+ # ── Phase 1: Triage ──────────────────────────────────────────────────
643
+ print("[enrichment] Phase 1: Triage β€” classifying missing cells…")
644
+ triaged = _triage_missing_cells(rows, config)
645
+ result.examined_cells = len(triaged)
646
+
647
+ if not triaged:
648
+ print("[enrichment] No missing cells found.")
649
+ _write_output(src, rows, fieldnames, result)
650
+ return result
651
+
652
+ searchable = [c for c in triaged if c.label == TRIAGE_SEARCHABLE]
653
+ imputable = [c for c in triaged if c.label != TRIAGE_SEARCHABLE]
654
+ print(f"[enrichment] {len(triaged)} missing cells: "
655
+ f"{len(searchable)} SEARCHABLE, {len(imputable)} auto-impute")
656
+
657
+ if config.max_cells is not None:
658
+ searchable = searchable[:config.max_cells]
659
+
660
+ # ── Phase 2: Category medians ────────────────────────────────────────
661
+ print("[enrichment] Phase 2: Computing category medians…")
662
+ cat_medians = _build_category_medians(rows, config.target_columns)
663
+
664
+ # ── Phase 3: NAV engine ──────────────────────────────────────────────
665
+ nav_results: dict[str, dict[str, float | None]] = {}
666
+ nav_filled: list[tuple[str, str, float]] = []
667
+
668
+ if searchable and config.enable_nav_engine:
669
+ print("[enrichment] Phase 3: NAV engine β€” computing trailing 3Y metrics…")
670
+ nav_cache = NavEngineCache()
671
+
672
+ # All funds with missing cells go through NAV engine β€” including debt/liquid.
673
+ # Debt funds can have valid Sharpe, Mean, Volatility etc. from their NAV history.
674
+ searchable_for_nav = searchable
675
+
676
+ row_groups: dict[int, list[TriagedCell]] = {}
677
+ for cell in searchable_for_nav:
678
+ row_groups.setdefault(cell.row_idx, []).append(cell)
679
+
680
+ total_rows = len(row_groups)
681
+ processed_count = 0
682
+ nav_lock = __import__("threading").Lock()
683
+
684
+ NAV_WORKERS = 20 # mfapi is stateless REST β€” scales well beyond 12
685
+
686
+ # ── Pre-warm: bulk load NAV + benchmarks before workers touch network ──
687
+ # Step 1: Pull all valid scheme codes and unique benchmarks from rows
688
+ from src.nav_metrics_engine import _bulk_preload_cache, _prewarm_benchmarks
689
+ _scheme_codes = [
690
+ (rows[ri].get("Scheme Code") or "").strip()
691
+ for ri in row_groups
692
+ if (rows[ri].get("Scheme Code") or "").strip().isdigit()
693
+ ]
694
+ _bench_tickers_raw = [
695
+ rows[ri].get("Benchmark Type", "") for ri in row_groups
696
+ ]
697
+ # Step 2: Resolve benchmark type β†’ ticker (same logic as nav engine)
698
+ from src.nav_metrics_engine import resolve_benchmark_ticker
699
+ _bench_tickers = list(dict.fromkeys(
700
+ resolve_benchmark_ticker(b) for b in _bench_tickers_raw if b
701
+ ))
702
+ # Step 3: Bulk load from Neon in 1 SQL query (nav + bench keys)
703
+ _bulk_preload_cache(_scheme_codes, _bench_tickers)
704
+ # Step 4: Download any cold benchmark tickers in parallel NOW,
705
+ # before workers start β€” eliminates yfinance contention
706
+ _prewarm_benchmarks(_bench_tickers)
707
+
708
+ def _process_one_fund(args):
709
+ row_idx, cells = args
710
+ row = rows[row_idx]
711
+ fund_name = row.get("Fund", "")
712
+ scheme_code = (row.get("Scheme Code") or "").strip()
713
+ benchmark_type = row.get("Benchmark Type", "")
714
+ needed_metrics = [c.column for c in cells]
715
+
716
+ if not scheme_code:
717
+ return fund_name, {}, cells
718
+
719
+ metrics, skip = compute_nav_metrics_for_scheme(
720
+ scheme_code=scheme_code,
721
+ benchmark_type=benchmark_type,
722
+ needed_metrics=needed_metrics,
723
+ cache=nav_cache,
724
+ )
725
+
726
+ joined_reasons = " | ".join(skip.values()).lower()
727
+ should_refresh_code = (
728
+ "returned no nav history" in joined_reasons
729
+ or "nav history is stale" in joined_reasons
730
+ )
731
+ if should_refresh_code:
732
+ refreshed_code, _ = resolve_scheme_code_for_fund_name(fund_name)
733
+ if refreshed_code and refreshed_code != scheme_code:
734
+ row["Scheme Code"] = refreshed_code
735
+ metrics, skip = compute_nav_metrics_for_scheme(
736
+ scheme_code=refreshed_code,
737
+ benchmark_type=benchmark_type,
738
+ needed_metrics=needed_metrics,
739
+ cache=nav_cache,
740
+ )
741
+
742
+ return fund_name, metrics, cells
743
+
744
+ from concurrent.futures import ThreadPoolExecutor, as_completed
745
+
746
+ work_items = list(row_groups.items())
747
+ with ThreadPoolExecutor(max_workers=NAV_WORKERS) as executor:
748
+ futures = {executor.submit(_process_one_fund, item): item for item in work_items}
749
+ for fut in as_completed(futures):
750
+ fund_name, metrics, cells = fut.result()
751
+ with nav_lock:
752
+ processed_count += 1
753
+ nav_results[fund_name] = metrics
754
+ for cell in cells:
755
+ val = metrics.get(cell.column)
756
+ if val is not None:
757
+ rows[cell.row_idx][cell.column] = str(round(float(val), 4))
758
+ result.updated_cells += 1
759
+ nav_filled.append((fund_name, cell.column, float(val)))
760
+ # Only mark as attempted if MDD was actually filled β€”
761
+ # drawdown_zero_fix should still retry funds where MDD came back None
762
+ if metrics.get("Maximum Drawdown") is not None:
763
+ _NAV_ATTEMPTED_FUNDS.add(fund_name)
764
+ if processed_count % 20 == 0 or processed_count == total_rows:
765
+ print(f" [nav] {processed_count}/{total_rows} funds processed…")
766
+
767
+ # Keep only still-missing searchable cells for web phase
768
+ searchable = [c for c in searchable if _is_missing(rows[c.row_idx].get(c.column, ""))]
769
+ print(f"[enrichment] NAV phase resolved {len(nav_filled)} cells; "
770
+ f"{len(searchable)} remain for web search")
771
+ result.nav_cells = len(nav_filled)
772
+
773
+ # ── Phase 4: Web search ──────────────────────────────────────────────
774
+ web_results: dict[str, dict[str, float | None]] = {}
775
+ web_sources: dict[str, list[str]] = {}
776
+ web_filled: list[tuple[str, str, float]] = []
777
+
778
+ firecrawl_key = os.environ.get("TAVILY_API_KEY", "")
779
+
780
+ if searchable and firecrawl_key:
781
+ fund_groups: dict[str, list[TriagedCell]] = {}
782
+ for cell in searchable:
783
+ fund_groups.setdefault(cell.fund_name, []).append(cell)
784
+
785
+ print(f"[enrichment] Phase 4: Web search β€” {len(searchable)} cells "
786
+ f"across {len(fund_groups)} funds")
787
+
788
+ # ── Pre-impute non-PE/PB cells if pe_pb_only mode ────────────────
789
+ # Do this before the parallel search so workers only handle PE/PB
790
+ web_search_groups: dict[str, list[TriagedCell]] = {}
791
+ for fund_name, cells in fund_groups.items():
792
+ if config.web_search_pe_pb_only:
793
+ cells_to_impute = [c for c in cells if c.column not in ("P/E Ratio", "P/B Ratio")]
794
+ for cell in cells_to_impute:
795
+ med = cat_medians.get(cell.category, {}).get(cell.column)
796
+ if med is not None and config.impute_unresolved:
797
+ rows[cell.row_idx][cell.column] = str(med)
798
+ result.updated_cells += 1
799
+ result.imputed_cells += 1
800
+ cells_for_web = [c for c in cells if c.column in ("P/E Ratio", "P/B Ratio")]
801
+ else:
802
+ cells_for_web = cells
803
+ if cells_for_web:
804
+ web_search_groups[fund_name] = cells_for_web
805
+
806
+ WEB_WORKERS = 10 # Tavily allows concurrent requests; stay conservative
807
+ web_lock = __import__("threading").Lock()
808
+ web_done = [0]
809
+ total_web = len(web_search_groups)
810
+
811
+ def _search_one_fund(args):
812
+ fund_name, cells = args
813
+ missing_metrics = [c.column for c in cells]
814
+ found, sources = _search_fund_metrics(fund_name, missing_metrics, config, firecrawl_key)
815
+ return fund_name, cells, found, sources
816
+
817
+ from concurrent.futures import ThreadPoolExecutor as _WebTPE, as_completed as _web_as_completed
818
+ with _WebTPE(max_workers=WEB_WORKERS) as web_executor:
819
+ futures = {
820
+ web_executor.submit(_search_one_fund, item): item
821
+ for item in web_search_groups.items()
822
+ }
823
+ for fut in _web_as_completed(futures):
824
+ fund_name, cells, found, sources = fut.result()
825
+ with web_lock:
826
+ web_done[0] += 1
827
+ web_results[fund_name] = found
828
+ web_sources[fund_name] = sources
829
+ print(f"\n[{web_done[0]}/{total_web}] {fund_name}")
830
+ for cell in cells:
831
+ val = found.get(cell.column)
832
+ if val is not None:
833
+ rows[cell.row_idx][cell.column] = str(val)
834
+ result.updated_cells += 1
835
+ web_filled.append((fund_name, cell.column, val))
836
+ print(f" -> {cell.column} = {val} (web)")
837
+ else:
838
+ med = cat_medians.get(cell.category, {}).get(cell.column)
839
+ if med is not None and config.impute_unresolved:
840
+ rows[cell.row_idx][cell.column] = str(med)
841
+ result.imputed_cells += 1
842
+ print(f" ~> {cell.column} = {med} (median)")
843
+ else:
844
+ result.skipped_cells += 1
845
+ print(f" x> {cell.column} β€” not found, no median")
846
+
847
+ elif searchable and not firecrawl_key:
848
+ print("[enrichment] WARNING: TAVILY_API_KEY not set β€” skipping web search, using medians")
849
+ result.errors.append("TAVILY_API_KEY not set")
850
+ for cell in searchable:
851
+ med = cat_medians.get(cell.category, {}).get(cell.column)
852
+ if med is not None and config.impute_unresolved:
853
+ rows[cell.row_idx][cell.column] = str(med)
854
+ result.imputed_cells += 1
855
+ else:
856
+ result.skipped_cells += 1
857
+
858
+ # ── Phase 5: Impute non-searchable (YOUNG / NOT_APPLICABLE) cells ────
859
+ # FIX: was incorrectly labelled "Phase 4" in log
860
+ medians_used: list[tuple[str, str, float]] = []
861
+ if imputable and config.impute_unresolved:
862
+ print(f"\n[enrichment] Phase 5: Imputing {len(imputable)} non-searchable cells…")
863
+ for cell in imputable:
864
+ med = cat_medians.get(cell.category, {}).get(cell.column)
865
+ if med is not None:
866
+ rows[cell.row_idx][cell.column] = str(med)
867
+ result.imputed_cells += 1
868
+ medians_used.append((cell.fund_name, cell.column, med))
869
+ else:
870
+ result.skipped_cells += 1
871
+ elif imputable:
872
+ result.skipped_cells += len(imputable)
873
+
874
+ # Record how many cells came from web search
875
+ result.web_cells = len(web_filled)
876
+
877
+ # ── Phase 6: Write enriched CSV ──────────────────────────────────────
878
+ _write_output(src, rows, fieldnames, result)
879
+
880
+ # ── Phase 7: Scratchpad ──────────────────────────────────────────────
881
+ scratch_dir = Path("scratchpad")
882
+ scratch_dir.mkdir(exist_ok=True)
883
+ stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
884
+ scratch_path = scratch_dir / f"enrichment_{stamp}.txt"
885
+ _write_scratchpad(
886
+ scratch_path, triaged, resolved_codes,
887
+ nav_results, web_results, web_sources,
888
+ medians_used, nav_filled, web_filled,
889
+ )
890
+ result.scratchpad_path = str(scratch_path)
891
+
892
+ print(f"\n[enrichment] DONE β€” nav_filled={len(nav_filled)} web_filled={len(web_filled)} "
893
+ f"imputed={result.imputed_cells} skipped={result.skipped_cells}")
894
+ print(f"[enrichment] Enriched CSV : {result.enriched_csv_path}")
895
+ print(f"[enrichment] Scratchpad : {scratch_path}")
896
+ return result
897
+
898
+
899
+ def _write_output(
900
+ src: Path,
901
+ rows: list[dict[str, str]],
902
+ fieldnames: list[str],
903
+ result: EnrichmentResult,
904
+ ) -> None:
905
+ out_dir = src.parent / "enriched"
906
+ out_dir.mkdir(exist_ok=True)
907
+ out_path = out_dir / f"enriched_{src.name}"
908
+ with open(out_path, "w", encoding="utf-8-sig", newline="") as f:
909
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
910
+ writer.writeheader()
911
+ writer.writerows(rows)
912
+ result.enriched_csv_path = str(out_path)
913
+
914
+
915
# Backward-compat alias (old name used in streamlit_app and run_enrichment_pipeline)
# NOTE: retained only so existing imports keep working; new code should call
# ``enrich_csv`` directly.
enrich_csv_with_firecrawl_and_kimi = enrich_csv
917
+
918
+
919
# ── Single metric lookup (for Streamlit UI) ──────────────────────────────────

def lookup_fund_metric_value(
    fund_name: str,
    column_name: str,
    scheme_code: str = "",
    config: EnrichmentConfig | None = None,
) -> dict[str, Any]:
    """Look up a single metric value for one fund via web search.

    Used by the Streamlit UI for on-demand lookups. ``scheme_code`` is
    accepted for interface compatibility but is not used by the search.

    Returns a dict with:
        status  -- "found", "not_found", or "error" (when the API key is missing)
        fund / metric / value / sources -- lookup details (absent on "error")
    """
    _load_env()
    cfg = EnrichmentConfig() if config is None else config

    api_key = os.environ.get("TAVILY_API_KEY", "")
    if not api_key:
        return {"status": "error", "message": "TAVILY_API_KEY not set"}

    found, sources = _search_fund_metrics(fund_name, [column_name], cfg, api_key)
    value = found.get(column_name)
    return {
        "status": "found" if value is not None else "not_found",
        "fund": fund_name,
        "metric": column_name,
        "value": value,
        "sources": sources,
    }
src/data_engine.py ADDED
@@ -0,0 +1,1210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Engine: -d mode
3
+
4
+ Reads the fund-stats CSV and exports processed Excel matching Processed data.xlsx format.
5
+
6
+ Layout (matching target XLSX):
7
+ - One combined sheet with all fund categories
8
+ - Header row (light green #C9FFCC)
9
+ - For each category:
10
+ - Category header row (no fill, bold text)
11
+ - BM Index row (Col A: #BAEAEE, CAGR cols F,G,H,I: #C4EFFF)
12
+ - Category Average row (Col A: #BAEAEE, CAGR cols F,G,H,I + P/E,P/B cols L,M: #C4EFFF)
13
+ - Fund rows sorted by score (weightage) descending, strictly largest to lowest
14
+ - Weightage scoring: Compare fund CAGR vs Category Average (NOT BM Index)
15
+ - 1Y CAGR beats Cat Avg: 2 pts
16
+ - 3Y CAGR beats Cat Avg: 3 pts
17
+ - 5Y CAGR beats Cat Avg: 4 pts
18
+ - 10Y CAGR beats Cat Avg: 5 pts
19
+ - Max possible: 14 pts
20
+ - Yellow background (#F1FFB6) on Col A only if Weightage >= 8
21
+ - NO green/red font coloring on CAGR cells (plain black only)
22
+ - Category Average row Col B is EMPTY (no benchmark type)
23
+ """
24
+
25
+ import csv
26
+ import math
27
+ import re
28
+ from datetime import datetime
29
+ from pathlib import Path
30
+ from typing import List, Optional, Tuple, Dict, Any
31
+
32
+ from openpyxl import Workbook
33
+ from openpyxl.styles import PatternFill, Font, Alignment, Border, Side
34
+ from openpyxl.utils import get_column_letter
35
+ from openpyxl.formatting.rule import Rule, CellIsRule, FormulaRule
36
+ from openpyxl.styles.differential import DifferentialStyle
37
+
38
+ from src.models import Fund
39
+ from src.weightage import compute_scores, drawdown_zero_fix
40
+ from src.reference_data import extract_reference_data, get_fund_weightage_from_reference, DEFAULT_REFERENCE_PATH
41
+
42
+
43
# ─── Color palette ─────────────────────────────────────────────────────────────────
# Solid PatternFills used when writing the processed workbook; hex values
# mirror the target "Processed data.xlsx" styling.
FILL_HEADER = PatternFill(start_color="C9FFCC", end_color="C9FFCC", fill_type="solid")  # light-green column-header row
FILL_BM_ROW = PatternFill(start_color="BAEAEE", end_color="BAEAEE", fill_type="solid")  # BM Index row, column A
FILL_BM_CAGR = PatternFill(start_color="C4EFFF", end_color="C4EFFF", fill_type="solid")  # BM Index CAGR cells
FILL_CAT_AVG = PatternFill(start_color="BAEAEE", end_color="BAEAEE", fill_type="solid")  # Category Average row, column A
FILL_CAT_CAGR = PatternFill(start_color="C4EFFF", end_color="C4EFFF", fill_type="solid")  # Category Average CAGR / P/E / P/B cells
FILL_WEIGHTED_YELLOW = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")  # high-weightage highlight
FILL_WEIGHTED_GREEN = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
FILL_WHITE = PatternFill(fill_type=None)  # NOTE: despite the name this is "no fill" — it clears any background
FILL_WEIGHT_REF = PatternFill(start_color="EDEDED", end_color="EDEDED", fill_type="solid")  # light grey weight row

# Quartile fills (rank-band colouring, best → worst quartile)
FILL_QUARTILE_GREEN = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
FILL_QUARTILE_YELLOW = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
FILL_QUARTILE_ORANGE = PatternFill(start_color="FFC000", end_color="FFC000", fill_type="solid")
FILL_QUARTILE_RED = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

# ── Fonts — Arial for identical rendering on macOS + Windows ─────────────────
# openpyxl falls back gracefully when Arial is absent, but both platforms ship it.
FONT_DEFAULT = Font(name="Arial", size=8, color="000000")
FONT_DEFAULT_BOLD = Font(name="Arial", size=8, bold=True, color="000000")
FONT_HEADER = Font(name="Arial", size=8, bold=True, color="000000")
FONT_CAT_HEADER = Font(name="Arial", size=10, bold=True, color="000000")  # larger face for category title rows
FONT_WEIGHT_REF = Font(name="Arial", size=7, italic=True, color="666666")  # subtle grey label

# Thin light-grey grid border applied cell-by-cell
THIN = Side(border_style="thin", color="CCCCCC")
BORDER_THIN = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)
70
+
71
+
72
# ─── Weight reference row data (advisor-revised March 2026) ──────────────────
# Shown beneath every category's column-header row as a read-only reference.
# Must match src/weightage.py WEIGHTS exactly.
# ↑ = Top-10 (higher better), ↓ = Bottom-10 (lower better)
WEIGHT_REF_ROW: Dict[str, str] = {
    "ter": "0.15 ↓",
    "turnover": "0.10 ↓",
    "cagr_3y": "0.40 ↑",
    "cagr_5y": "0.60 ↑",
    "cagr_10y": "0.75 ↑",
    "pe_ratio": "0.15 ↓",
    "alpha": "1.00 ↑*",  # * = Light Red if α < 1
    "std_dev": "1.00 ↓",
    "sharpe": "1.20 ↑",
    "sortino": "1.30 ↑",
    "down_capture": "1.00 ↓",
    "max_drawdown": "1.35 ↑",
    "info_ratio": "1.00 ↑*",  # * = Light Red if IR < 0
    "weightage": "10.00",  # equals the sum of the thirteen weights above
}
92
+
93
+
94
# ─── Column definitions ───────────────────────────────────────────────────────
# Tuple: (header_label, fund_attr, col_width, is_pct, decimal_places)
# Widths are calibrated so wrap_text = True keeps cells readable without
# the advisor needing to manually drag columns on either platform.
# List order defines workbook columns A..Z (letters noted per entry).
XLSX_COLUMNS = [
    ("Fund", "name", 40, False, 0),  # A — wide: long fund names
    ("Benchmark Type", "benchmark", 22, False, 0),  # B
    ("TER", "ter", 9, True, 4),  # C
    ("Turn over (%)", "turnover", 11, True, 2),  # D
    ("Mean", "mean", 9, False, 2),  # E
    ("1 Year CAGR", "cagr_1y", 10, False, 2),  # F
    ("3 Years CAGR", "cagr_3y", 10, False, 2),  # G
    ("5 Years CAGR", "cagr_5y", 10, False, 2),  # H
    ("10 Years CAGR", "cagr_10y", 11, False, 2),  # I
    ("CAGR Since Inception", "cagr_inception", 14, False, 2),  # J
    ("NAV", "nav", 10, False, 2),  # K
    ("P/E Ratio", "pe_ratio", 10, False, 2),  # L
    ("P/B Ratio", "pb_ratio", 10, False, 2),  # M
    ("Alpha", "alpha", 10, False, 2),  # N
    ("Volatility", "volatility", 10, False, 2),  # O
    ("Beta", "beta", 9, False, 2),  # P
    ("Standard Deviation", "std_dev", 14, False, 2),  # Q
    ("Sharpe Ratio", "sharpe", 11, False, 2),  # R
    ("Sortino Ratio", "sortino", 11, False, 2),  # S
    ("Up Market Capture", "up_capture", 14, False, 2),  # T
    ("Down Market Capture", "down_capture", 16, False, 2),  # U
    ("Maximum Drawdown", "max_drawdown", 15, False, 2),  # V
    ("R-Squared", "r_squared", 11, False, 2),  # W
    ("Information Ratio", "info_ratio", 14, False, 2),  # X
    ("Total Assets (in Cr)", "aum", 16, False, 1),  # Y
    ("Weightage", "weightage", 11, False, 3),  # Z — 3dp for precision
]

# Total number of workbook columns (26 == A..Z)
NUM_COLS = len(XLSX_COLUMNS)
128
+
129
+
130
+ def _to_float(val) -> Optional[float]:
131
+ """Safely convert raw CSV value to float."""
132
+ if val is None:
133
+ return None
134
+ s = str(val).strip().replace('%', '').replace(',', '')
135
+ if s in ('', '-', 'N/A*', 'N/A', 'nan', 'None'):
136
+ return None
137
+ try:
138
+ return float(s)
139
+ except ValueError:
140
+ return None
141
+
142
+
143
+ def _parse_ter(val) -> Optional[float]:
144
+ """Parse TER value - CSV has percentage format like '1.40%', convert to decimal."""
145
+ if val is None:
146
+ return None
147
+ # Check if percentage BEFORE stripping
148
+ is_pct = '%' in str(val)
149
+ s = str(val).strip().replace('%', '').replace(',', '')
150
+ if s in ('', '-', 'N/A*', 'N/A', 'nan', 'None'):
151
+ return None
152
+ try:
153
+ v = float(s)
154
+ # Convert percentage to decimal (e.g., 1.40 -> 0.014)
155
+ if is_pct:
156
+ v = v / 100
157
+ return v
158
+ except ValueError:
159
+ return None
160
+
161
+
162
+ def _parse_turnover(val) -> Optional[float]:
163
+ """Parse turnover value - CSV has percentage format like '20%', convert to decimal."""
164
+ if val is None:
165
+ return None
166
+ # Check if percentage BEFORE stripping
167
+ is_pct = '%' in str(val)
168
+ s = str(val).strip().replace('%', '').replace(',', '')
169
+ if s in ('', '-', 'N/A*', 'N/A', 'nan', 'None'):
170
+ return None
171
+ try:
172
+ v = float(s)
173
+ # Convert percentage to decimal (e.g., 20 -> 0.20)
174
+ if is_pct:
175
+ v = v / 100
176
+ return v
177
+ except ValueError:
178
+ return None
179
+
180
+
181
+ def _parse_launch_date(val) -> Optional[datetime]:
182
+ """Parse launch date from CSV into datetime."""
183
+ if val is None:
184
+ return None
185
+ s = str(val).strip()
186
+ if not s or s in ("-", "N/A", "N/A*"):
187
+ return None
188
+ for fmt in ("%d-%m-%Y", "%Y-%m-%d", "%d/%m/%Y"):
189
+ try:
190
+ return datetime.strptime(s, fmt)
191
+ except ValueError:
192
+ continue
193
+ return None
194
+
195
+
196
+ # ─── Auto-calculation for incomplete sections ────────────────────────────────────
197
+
198
+ def _calculate_category_averages(funds: List[Fund]) -> Dict[str, Dict[str, Any]]:
199
+ """
200
+ Calculate category averages from fund-level category CAGR values.
201
+ For categories without official data, extract category average values from fund rows.
202
+ Uses the FIRST fund's category average value for each period.
203
+ """
204
+ categories: Dict[str, List[Fund]] = {}
205
+
206
+ # Group funds by category
207
+ for fund in funds:
208
+ if fund.category not in categories:
209
+ categories[fund.category] = []
210
+ categories[fund.category].append(fund)
211
+
212
+ cat_avg_data: Dict[str, Dict[str, Any]] = {}
213
+
214
+ for cat_name, cat_funds in categories.items():
215
+ if not cat_funds:
216
+ continue
217
+
218
+ # Use the FIRST fund's category average values
219
+ # This matches the CSV structure where all funds should have the same category average
220
+ first_fund = cat_funds[0]
221
+
222
+ cat_avg_data[cat_name] = {
223
+ 'cagr_1y': first_fund.cagr_1y_cat if first_fund.cagr_1y_cat and first_fund.cagr_1y_cat != 0 else None,
224
+ 'cagr_3y': first_fund.cagr_3y_cat if first_fund.cagr_3y_cat and first_fund.cagr_3y_cat != 0 else None,
225
+ 'cagr_5y': first_fund.cagr_5y_cat if first_fund.cagr_5y_cat and first_fund.cagr_5y_cat != 0 else None,
226
+ 'cagr_10y': first_fund.cagr_10y_cat if first_fund.cagr_10y_cat and first_fund.cagr_10y_cat != 0 else None,
227
+ 'pe_ratio': None,
228
+ 'pb_ratio': None,
229
+ 'is_calculated': True # Flag to indicate this is calculated from fund data
230
+ }
231
+
232
+ return cat_avg_data
233
+
234
+
235
+ def _calculate_benchmark_index(funds: List[Fund]) -> Dict[str, Dict[str, Any]]:
236
+ """
237
+ Calculate BM Index from fund-level benchmark CAGR values.
238
+ For categories without a BM Index row in CSV, extract benchmark values from fund rows.
239
+ Uses the FIRST fund's benchmark value for each period.
240
+ """
241
+ categories: Dict[str, List[Fund]] = {}
242
+
243
+ # Group funds by category
244
+ for fund in funds:
245
+ if fund.category not in categories:
246
+ categories[fund.category] = []
247
+ categories[fund.category].append(fund)
248
+
249
+ bm_data: Dict[str, Dict[str, Any]] = {}
250
+
251
+ for cat_name, cat_funds in categories.items():
252
+ if not cat_funds:
253
+ continue
254
+
255
+ # Use the FIRST fund's benchmark values
256
+ # This matches the CSV structure where we take the first fund's data
257
+ first_fund = cat_funds[0]
258
+
259
+ bm_data[cat_name] = {
260
+ 'cagr_1y': first_fund.cagr_1y_bm if first_fund.cagr_1y_bm is not None else None,
261
+ 'cagr_3y': first_fund.cagr_3y_bm if first_fund.cagr_3y_bm is not None else None,
262
+ 'cagr_5y': first_fund.cagr_5y_bm if first_fund.cagr_5y_bm is not None else None,
263
+ 'cagr_10y': first_fund.cagr_10y_bm if first_fund.cagr_10y_bm is not None else None,
264
+ 'is_calculated': True # Flag to indicate this is calculated from fund data
265
+ }
266
+
267
+ return bm_data
268
+
269
+
270
+ # ─── CSV Loader ───────────────────────────────────────────────────────────────────
271
+
272
def load_fund_csv(csv_path: str) -> Tuple[List[Fund], Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, int]]:
    """
    Parse the fund-stats CSV and merge with reference data from Processed_data.xlsx.
    For sections with missing reference data, auto-calculates category averages from fund data.

    Args:
        csv_path: Path to the fund-stats CSV (old 35-column layout, or the new
            36-column layout that adds 'Category' / 'Scheme Code' columns).

    Returns:
        Tuple of (funds, bm_data, cat_avg_data, fund_weightages):
        - funds: Fund objects in CSV order, deduplicated by (name, category)
        - bm_data: per-category BM Index CAGRs derived from fund rows
        - cat_avg_data: per-category average CAGRs derived from fund rows
        - fund_weightages: weightages taken from the reference workbook

    Raises:
        FileNotFoundError: if csv_path does not exist.
        ValueError: if the CSV contains no rows at all.
    """
    csv_path = Path(csv_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    # Load reference data from Processed_data.xlsx
    ref_bm_data, ref_cat_avg_data, ref_fund_weightages = extract_reference_data(DEFAULT_REFERENCE_PATH)

    funds: List[Fund] = []
    current_category = "Unknown"
    bm_data: Dict[str, Dict[str, Any]] = {}
    cat_avg_data: Dict[str, Dict[str, Any]] = {}

    # utf-8-sig strips a leading BOM; errors='replace' tolerates stray bytes
    with open(csv_path, encoding='utf-8-sig', errors='replace') as f:
        reader = csv.reader(f)
        rows = list(reader)

    # DYNAMIC COLUMN DETECTION - Read header row first
    if not rows:
        raise ValueError("CSV file is empty")

    header = [str(col).strip() for col in rows[0]]
    col_map = {name: idx for idx, name in enumerate(header)}

    print(f"Detected CSV format with {len(header)} columns")

    # Detect format based on column names
    has_category_col = 'Category' in col_map
    has_scheme_code = 'Scheme Code' in col_map

    if has_category_col and has_scheme_code:
        print(" Format: NEW (36 columns with Category column)")
    else:
        print(" Format: OLD (35 columns without Category column)")

    # NOTE(review): pending_bm / pending_cat_avg are written below but never
    # read again — the returned bm_data / cat_avg_data come solely from the
    # calculated values at the end. They look like leftovers of an earlier
    # reference-data merge; confirm before removing.
    pending_bm: Dict[str, Dict[str, Any]] = {}
    pending_cat_avg: Dict[str, Dict[str, Any]] = {}
    seen_fund_category: set[tuple[str, str]] = set()
    deduped_rows = 0

    # Helper to get column index safely
    def get_col_idx(col_name: str) -> Optional[int]:
        return col_map.get(col_name)

    for row_idx, row in enumerate(rows):
        if row_idx == 0:  # Skip header row
            continue

        if not row:
            continue

        col0 = str(row[0]).strip()

        # Category header - detect by checking if most columns are empty
        # Category headers are standalone rows with category name in col0 and empty data columns
        # This catches: "Equity: Large Cap", "Childrens Fund", "ETFs", "Retirement Fund", etc.
        # But NOT "BM Index" or "Category Average" rows
        if col0 not in ('BM Index', 'Category Average', '', 'nan'):
            # Check if this looks like a category header (columns 2-10 are empty)
            # For old format: check columns 2-10 (Benchmark Type is col 1, so skip it)
            # For new format: check columns 2-10 (Category is col 1, so skip it)
            check_cols = row[2:11] if len(row) > 10 else row[2:6]
            non_empty_count = sum(1 for cell in check_cols if str(cell).strip() not in ('', 'nan', 'None', '-'))

            if non_empty_count == 0 and len(col0) > 3:  # All checked columns are empty - this is a category header
                current_category = col0

                # Use reference data if available, otherwise use CSV data (which may be empty)
                if current_category in ref_bm_data:
                    pending_bm[current_category] = ref_bm_data[current_category]
                else:
                    pending_bm[current_category] = None

                if current_category in ref_cat_avg_data:
                    pending_cat_avg[current_category] = ref_cat_avg_data[current_category]
                else:
                    pending_cat_avg[current_category] = None
                continue

        # BM Index row - skip, we're using reference data
        if col0 == 'BM Index':
            continue

        # Category Average row - skip, we're using reference data
        if col0 == 'Category Average':
            continue

        # Skip header rows (repeated headers in CSV)
        if col0 == 'Fund' and len(row) > 1:
            # Check if this is a header row by looking at column 1
            col1 = str(row[1]).strip() if len(row) > 1 else ''
            if col1 in ('Benchmark Type', 'Category'):
                continue

        if col0 in ('', 'nan'):
            continue

        # Parse fund using dynamic column mapping.
        # NOTE: g / get_str close over the current `row`, so they are
        # (re)defined on every iteration by design.
        def g(col_name: str) -> Optional[float]:
            idx = get_col_idx(col_name)
            if idx is None:
                return None
            try:
                return _to_float(row[idx])
            except (IndexError, TypeError):
                return None

        def get_str(col_name: str) -> str:
            idx = get_col_idx(col_name)
            if idx is None:
                return ""
            try:
                return str(row[idx]).strip()
            except (IndexError, TypeError):
                return ""

        # Get category - either from Category column or from current_category
        if has_category_col:
            fund_category = get_str('Category') or current_category
        else:
            fund_category = current_category

        # Get benchmark
        benchmark = get_str('Benchmark Type')

        # Get TER and Turnover with special parsing (percentage -> decimal)
        ter_idx = get_col_idx('TER')
        ter_val = _parse_ter(row[ter_idx]) if ter_idx is not None and len(row) > ter_idx else None

        turnover_idx = get_col_idx('Turn over (%)')
        turnover_val = _parse_turnover(row[turnover_idx]) if turnover_idx is not None and len(row) > turnover_idx else None

        # Drop exact (fund, category) repeats — first occurrence wins
        dedupe_key = (col0.strip().lower(), fund_category.strip().lower())
        if dedupe_key in seen_fund_category:
            deduped_rows += 1
            continue
        seen_fund_category.add(dedupe_key)

        fund = Fund(
            name=col0,
            category=fund_category,
            benchmark=benchmark,
            ter=ter_val,
            turnover=turnover_val,
            mean=g('Mean'),
            cagr_1y=g('1 Year CAGR'),
            cagr_1y_cat=g('1 Year Category CAGR'),
            cagr_1y_bm=g('1 Year Benchmark CAGR'),
            cagr_3y=g('3 Years CAGR'),
            cagr_3y_cat=g('3 Years Category CAGR'),
            cagr_3y_bm=g('3 Years Benchmark CAGR'),
            cagr_5y=g('5 Years CAGR'),
            cagr_5y_cat=g('5 Years Category CAGR'),
            cagr_5y_bm=g('5 Years Benchmark CAGR'),
            cagr_10y=g('10 Years CAGR'),
            cagr_10y_cat=g('10 Years Category CAGR'),
            cagr_10y_bm=g('10 Years Benchmark CAGR'),
            cagr_inception=g('CAGR Since Inception'),
            nav=g('NAV'),
            pe_ratio=g('P/E Ratio'),
            pb_ratio=g('P/B Ratio'),
            alpha=g('Alpha'),
            beta=g('Beta'),
            std_dev=g('Standard Deviation'),
            sharpe=g('Sharpe Ratio'),
            volatility=g('Volatility'),
            sortino=g('Sortino Ratio'),
            # NOTE(review): the `or` fallback also skips a parsed 0.0 from the
            # first header variant — assumed intentional (0 capture ratios
            # treated as missing); confirm.
            up_capture=g('Up Market Capture\nRatio') or g('Up Market Capture'),
            down_capture=g('Down Market Capture\nRatio') or g('Down Market Capture'),
            max_drawdown=g('Maximum Drawdown'),
            r_squared=g('R-Squared'),
            info_ratio=g('Information Ratio'),
            aum=g('Total Assets (in Cr)'),
            fill_status=get_str('Fill Status') or None,
        )
        # Preserve scheme code for downstream NAV / drawdown fixes
        scheme_code_str = get_str('Scheme Code')
        if scheme_code_str:
            setattr(fund, "_scheme_code", scheme_code_str)
        launch_dt = _parse_launch_date(get_str('Launch Date'))
        if launch_dt:
            setattr(fund, "_launch_date", launch_dt)
        fund.order = len(funds)  # Preserve original CSV order for tiebreaker
        funds.append(fund)

    if deduped_rows:
        print(f" Deduplicated {deduped_rows} rows by (Fund, Category) at ingest")

    # Calculate category averages from fund data
    calculated_cat_avg = _calculate_category_averages(funds)

    # Calculate BM Index from fund-level benchmark data
    calculated_bm = _calculate_benchmark_index(funds)

    # Assign BM and Category Average data - ONLY use calculated data from CSV
    # DO NOT use reference data from Processed_data.xlsx
    for cat_name in set(f.category for f in funds):
        # BM Index: Always use calculated data from fund benchmark values
        bm_data[cat_name] = calculated_bm.get(cat_name, {})

        # Category Average: Always use calculated data from fund category values
        cat_avg_data[cat_name] = calculated_cat_avg.get(cat_name, {})

    return funds, bm_data, cat_avg_data, ref_fund_weightages
481
+
482
+
483
+ def _fmt(val, decimals=2) -> Optional[float]:
484
+ """Return rounded float or None."""
485
+ if val is None:
486
+ return None
487
+ try:
488
+ return round(float(val), decimals)
489
+ except (ValueError, TypeError):
490
+ return None
491
+
492
+
493
+ def _quartile_band_for_position(pos: int, total: int) -> Optional[int]:
494
+ """
495
+ Return quartile band by positional rank (0-based) after sorting by score desc.
496
+
497
+ Band mapping:
498
+ - 0: Top quartile (Green)
499
+ - 1: Upper-middle quartile (Yellow)
500
+ - 2: Lower-middle quartile (Orange)
501
+ - 3: Bottom quartile (Red)
502
+
503
+ Uses rank-positioning (not score thresholds), so ties do not distort quartile sizes.
504
+ """
505
+ if total <= 0 or pos < 0 or pos >= total:
506
+ return None
507
+
508
+ # Keep intuitive behavior for tiny categories.
509
+ if total == 1:
510
+ return 0
511
+ if total == 2:
512
+ return 0 if pos == 0 else 3
513
+ if total == 3:
514
+ if pos == 0:
515
+ return 0
516
+ if pos == 1:
517
+ return 1
518
+ return 3
519
+
520
+ q1_end = math.ceil(total * 0.25)
521
+ q2_end = math.ceil(total * 0.50)
522
+ q3_end = math.ceil(total * 0.75)
523
+
524
+ if pos < q1_end:
525
+ return 0
526
+ if pos < q2_end:
527
+ return 1
528
+ if pos < q3_end:
529
+ return 2
530
+ return 3
531
+
532
+
533
+ def _calculate_weightage(fund: Fund, cat_avg_vals: Dict[str, Any]) -> int:
534
+ """
535
+ DEPRECATED: Legacy CAGR-based weightage calculation.
536
+ Use compute_scores() from weightage.py for AI-suggested model.
537
+
538
+ Calculate weightage based on period-weighted scoring against Category Average.
539
+
540
+ Period weights:
541
+ - 1 Year CAGR: 2 pts if fund beats Category Average
542
+ - 3 Years CAGR: 3 pts if fund beats Category Average
543
+ - 5 Years CAGR: 4 pts if fund beats Category Average
544
+ - 10 Years CAGR: 5 pts if fund beats Category Average
545
+
546
+ Max possible: 14 pts
547
+ Note: Treat 0, N/A*, or - as "no data" (skip comparison)
548
+ """
549
+ weightage = 0
550
+
551
+ # Period weights mapping
552
+ period_weights = {
553
+ 'cagr_1y': 2,
554
+ 'cagr_3y': 3,
555
+ 'cagr_5y': 4,
556
+ 'cagr_10y': 5,
557
+ }
558
+
559
+ for attr, weight in period_weights.items():
560
+ fund_val = getattr(fund, attr, None)
561
+ cat_avg_val = cat_avg_vals.get(attr) if cat_avg_vals else None
562
+
563
+ # Skip if fund value is 0, None, or invalid
564
+ if fund_val is None or fund_val == 0:
565
+ continue
566
+ if cat_avg_val is None or cat_avg_val == 0:
567
+ continue
568
+
569
+ # Award points if fund beats category average
570
+ if fund_val > cat_avg_val:
571
+ weightage += weight
572
+
573
+ return weightage
574
+
575
+
576
def _calculate_green_cell_weightage(fund: Fund, all_funds_in_category: List[Fund]) -> int:
    """
    Score a fund as the number of metrics where it ranks in the category's
    top 10 — i.e. the count of cells the workbook highlights GREEN.

    Mirrors the Excel conditional-formatting rules: only "top-10 = green"
    metrics participate (bottom-10 metrics are highlighted red and are
    never counted). Eleven metrics qualify, so the maximum score is 11.
    """
    # Metrics whose cells carry Top-10-green formatting (workbook column noted).
    top10_green_metrics = (
        'cagr_1y',         # F
        'cagr_3y',         # G
        'cagr_5y',         # H
        'cagr_10y',        # I
        'cagr_inception',  # J
        'alpha',           # N
        'sharpe',          # R
        'sortino',         # S
        'up_capture',      # T
        'info_ratio',      # X
        'aum',             # Y (Total Assets)
    )
    return sum(
        1
        for metric in top10_green_metrics
        if _is_in_top_10(fund, all_funds_in_category, metric, higher_is_better=True)
    )
613
+
614
+
615
def _is_in_top_10(fund: Fund, all_funds: List[Fund], metric: str, higher_is_better: bool) -> bool:
    """
    Check if a fund is in top 10 for a given metric within its category.

    Args:
        fund: The fund to check
        all_funds: All funds in the same category
        metric: The metric attribute name (e.g., 'cagr_1y', 'ter')
        higher_is_better: True if higher values are better, False if lower is better

    Returns: True if fund is in top 10, False otherwise
    """
    fund_val = getattr(fund, metric, None)

    # Skip if fund doesn't have this metric (0 is treated as "no data" by convention
    # throughout this module).
    if fund_val is None or fund_val == 0:
        return False

    # Collect all valid (present, non-zero) values for this metric in the category.
    valid_values = []
    for f in all_funds:
        val = getattr(f, metric, None)
        if val is not None and val != 0:
            valid_values.append(val)

    # Need at least 10 funds with data to have a meaningful top 10.
    if len(valid_values) < 10:
        # Small category: fall back to a "top half" check instead.
        if len(valid_values) < 2:
            return False
        valid_values.sort(reverse=higher_is_better)
        threshold_idx = len(valid_values) // 2
        threshold = valid_values[threshold_idx]
        if higher_is_better:
            return fund_val >= threshold
        else:
            return fund_val <= threshold

    # Rank by counting strictly-better peers. (The previous version also sorted
    # valid_values here, but the sorted order was never used on this path, so the
    # O(n log n) sort has been removed — behavior is unchanged.)
    if higher_is_better:
        better_count = sum(1 for v in valid_values if v > fund_val)
    else:
        better_count = sum(1 for v in valid_values if v < fund_val)

    # Fund is in top 10 if 9 or fewer funds are strictly better (ranks 1-10).
    return better_count <= 9
664
+
665
+
666
def _get_cagr_font_color() -> Font:
    """Return the default (black) font for CAGR cells.

    Font coloring is deliberately disabled per instructions:
    "CRITICAL: NO green/red font coloring anywhere".
    """
    return FONT_DEFAULT
672
+
673
+
674
def _apply_conditional_formatting(ws, start_row: int, end_row: int, cat_avg_vals: Dict[str, Any]):
    """
    Apply conditional formatting rules per MF_Scoring_Model.md

    Light Green (C6EFCE) + Dark Green Text (006100) for:
    - Top 10: CAGR (all periods), Alpha, Sharpe, Sortino, Up Capture, R-Squared, Info Ratio, Total Assets, CAGR Since Inception
    - Bottom 10: TER, Turnover, Beta, Std Dev, Down Capture, P/E, P/B, Max Drawdown

    Light Red (FFC7CE) for threshold violations:
    - Alpha < 1
    - Info Ratio < 0
    - CAGR < Category Average (all periods)

    Args:
        ws: openpyxl worksheet the rules are added to.
        start_row / end_row: inclusive 1-based row span of this category's fund rows.
        cat_avg_vals: category-average values keyed by metric attr (e.g. 'cagr_1y').
    """
    # NOTE(review): start_row == end_row (a single fund row) is skipped entirely —
    # confirm that single-fund categories should really get no highlighting.
    if start_row >= end_row:
        return

    # Define colors for conditional formatting
    green_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
    green_font = Font(color="006100")
    red_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
    red_font = Font(color="9C0006")

    # ═══════════════════════════════════════════════════════════════════════════
    # DUAL-CONDITION COLUMNS (Green for Top 10, Red for threshold violations)
    # ═══════════════════════════════════════════════════════════════════════════
    # Red rules are added FIRST with stopIfTrue=True so red wins over green when
    # both conditions match a cell (rule priority follows insertion order).

    # CAGR columns: Green for Top 10, Red if < Category Average
    cagr_cols = {
        'F': (6, cat_avg_vals.get('cagr_1y')),   # 1 Year CAGR
        'G': (7, cat_avg_vals.get('cagr_3y')),   # 3 Years CAGR
        'H': (8, cat_avg_vals.get('cagr_5y')),   # 5 Years CAGR
        'I': (9, cat_avg_vals.get('cagr_10y')),  # 10 Years CAGR
    }

    for col_letter, (col_num, cat_avg) in cagr_cols.items():
        range_str = f"{col_letter}{start_row}:{col_letter}{end_row}"

        # Rule 1: Red if < Category Average (higher priority)
        if cat_avg is not None:
            rule_red = CellIsRule(
                operator='lessThan',
                formula=[str(cat_avg)],
                stopIfTrue=True,  # Stop if red applies
                fill=red_fill,
                font=red_font
            )
            ws.conditional_formatting.add(range_str, rule_red)

        # Rule 2: Green for Top 10
        rule_green = Rule(
            type='top10',
            rank=10,
            stopIfTrue=False
        )
        rule_green.dxf = DifferentialStyle(fill=green_fill, font=green_font)
        ws.conditional_formatting.add(range_str, rule_green)

    # Alpha (Col N = 14): Green for Top 10, Red if < 1
    range_str = f"N{start_row}:N{end_row}"
    rule_red = CellIsRule(
        operator='lessThan',
        formula=['1'],
        stopIfTrue=True,
        fill=red_fill,
        font=red_font
    )
    ws.conditional_formatting.add(range_str, rule_red)

    rule_green = Rule(type='top10', rank=10, stopIfTrue=False)
    rule_green.dxf = DifferentialStyle(fill=green_fill, font=green_font)
    ws.conditional_formatting.add(range_str, rule_green)

    # Information Ratio (Col X = 24): Green for Top 10, Red if < 0
    range_str = f"X{start_row}:X{end_row}"
    rule_red = CellIsRule(
        operator='lessThan',
        formula=['0'],
        stopIfTrue=True,
        fill=red_fill,
        font=red_font
    )
    ws.conditional_formatting.add(range_str, rule_red)

    rule_green = Rule(type='top10', rank=10, stopIfTrue=False)
    rule_green.dxf = DifferentialStyle(fill=green_fill, font=green_font)
    ws.conditional_formatting.add(range_str, rule_green)

    # ═══════════════════════════════════════════════════════════════════════════
    # TOP 10 COLUMNS (Green - Higher is Better)
    # ═══════════════════════════════════════════════════════════════════════════

    # Values here are human-readable column names; only the keys (letters) drive
    # the formatting — the name is documentation for the mapping itself.
    top10_cols = {
        'J': 'CAGR Since Inception',
        'R': 'Sharpe Ratio',
        'S': 'Sortino Ratio',
        'T': 'Up Market Capture',
        'W': 'R-Squared',
        'Y': 'Total Assets'
    }

    for col_letter, name in top10_cols.items():
        range_str = f"{col_letter}{start_row}:{col_letter}{end_row}"
        rule = Rule(type='top10', rank=10, stopIfTrue=False)
        rule.dxf = DifferentialStyle(fill=green_fill, font=green_font)
        ws.conditional_formatting.add(range_str, rule)

    # Maximum Drawdown (Col V): Top 10 among NON-ZERO values only.
    # This keeps zeros as "no data" and avoids green highlighting for zero entries.
    v_range = f"V{start_row}:V{end_row}"
    # Guard against text placeholders like "NA": Excel treats "NA" <> 0 as TRUE,
    # which can incorrectly qualify the cell for highlighting. Only numeric values participate.
    v_formula = (
        f'AND('
        f'ISNUMBER(V{start_row}),'
        f'V{start_row}<>0,'
        f'COUNTIFS($V${start_row}:$V${end_row},\">\"&V{start_row},$V${start_row}:$V${end_row},\"<>0\")<10'
        f')'
    )
    v_rule = FormulaRule(formula=[v_formula], stopIfTrue=False, fill=green_fill, font=green_font)
    ws.conditional_formatting.add(v_range, v_rule)

    # ═══════════════════════════════════════════════════════════════════════════
    # BOTTOM 10 COLUMNS (Green - Lower is Better)
    # ═══════════════════════════════════════════════════════════════════════════

    bottom10_cols = {
        'C': 'TER',
        'D': 'Turnover',
        'L': 'P/E Ratio',
        'P': 'Beta',
        'Q': 'Standard Deviation',
        'U': 'Down Market Capture'
    }

    for col_letter, name in bottom10_cols.items():
        range_str = f"{col_letter}{start_row}:{col_letter}{end_row}"
        rule = Rule(
            type='top10',
            rank=10,
            bottom=True,  # Bottom 10 = lowest values
            stopIfTrue=False
        )
        rule.dxf = DifferentialStyle(fill=green_fill, font=green_font)
        ws.conditional_formatting.add(range_str, rule)
818
+
819
+
820
def export_excel(funds: List[Fund], output_path: str,
                 bm_data: Dict[str, Dict[str, Any]] = None,
                 cat_avg_data: Dict[str, Dict[str, Any]] = None) -> str:
    """Build the processed Excel matching target format exactly.

    Args:
        funds: all fund rows to export; grouped by ``fund.category`` in first-seen order.
        output_path: destination .xlsx path (parent dirs are created).
        bm_data: per-category benchmark-index metric values (attr -> value).
        cat_avg_data: per-category average metric values (attr -> value).

    Returns:
        The saved workbook path as a string.

    Side effects:
        Writes the workbook; if any value was converted to "NA", also writes a
        tab-separated audit file ``<stem>_na_audit.txt`` next to the workbook
        and prints its path.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if bm_data is None:
        bm_data = {}
    if cat_avg_data is None:
        cat_avg_data = {}

    wb = Workbook()
    ws = wb.active
    ws.title = "Sheet2"
    na_audit_rows: List[str] = []

    # Apply NA policy to all numeric export columns.
    # Exclusions are text/derived columns that should stay as-is.
    na_on_zero_attrs = {
        attr for _, attr, _, _, _ in XLSX_COLUMNS
        if attr and attr not in {"name", "benchmark", "weightage"}
    }
    # Years of history each CAGR column requires (used for NA-reason audit only).
    cagr_period_by_attr = {
        "cagr_1y": 1,
        "cagr_3y": 3,
        "cagr_5y": 5,
        "cagr_10y": 10,
    }

    def _years_since_launch(fund_obj: Fund) -> Optional[float]:
        # Fund age in years from the private `_launch_date`; None when absent/not a datetime.
        launch_dt = getattr(fund_obj, "_launch_date", None)
        if not isinstance(launch_dt, datetime):
            return None
        return max(0.0, (datetime.now() - launch_dt).days / 365.25)

    def _audit_na(row_type: str, category: str, fund_name: str, attr: str, reason: str) -> None:
        # One tab-separated audit line per NA decision.
        na_audit_rows.append(
            f"{row_type}\t{category}\t{fund_name}\t{attr}\t{reason}"
        )

    def _display_numeric_or_na(
        *,
        attr: str,
        value: Any,
        row_type: str,
        category: str,
        fund_obj: Optional[Fund] = None,
        fund_name: str = "",
        decimals: int = 2,
    ) -> Any:
        """
        Convert numeric value to rounded float or 'NA' for missing/invalid values.
        Also appends NA decisions to audit rows.
        Category Average: PE and PB show blank (not NA) when missing.
        """
        # Category Average row: PE and PB stay blank when missing
        if row_type == "CATEGORY_AVG" and attr in ("pe_ratio", "pb_ratio"):
            if value is None:
                return None
            try:
                num = float(value)
                return round(num, decimals) if num != 0 else None
            except (TypeError, ValueError):
                return None

        if attr in na_on_zero_attrs:
            if value is None:
                _audit_na(row_type, category, fund_name, attr, "missing value")
                return "NA"
            try:
                num = float(value)
            except (TypeError, ValueError):
                _audit_na(row_type, category, fund_name, attr, "non-numeric value")
                return "NA"

            if num == 0:
                # Duration-aware reason for CAGR periods when launch date exists.
                if fund_obj is not None and attr in cagr_period_by_attr:
                    years = _years_since_launch(fund_obj)
                    period = cagr_period_by_attr[attr]
                    if years is not None and years < period:
                        _audit_na(
                            row_type,
                            category,
                            fund_name,
                            attr,
                            f"fund age {years:.2f}y < required {period}y",
                        )
                    else:
                        _audit_na(row_type, category, fund_name, attr, "source value is 0")
                else:
                    _audit_na(row_type, category, fund_name, attr, "source value is 0")
                return "NA"

            return round(num, decimals)

        # Non-NA-managed attributes use existing behavior.
        if value is None:
            return None
        try:
            return round(float(value), decimals)
        except (TypeError, ValueError):
            return value

    # ── Row 1: Column headers (include weight hints for scored metrics) ─────
    ws.row_dimensions[1].height = 36
    for col_idx, (header, attr, width, _, _) in enumerate(XLSX_COLUMNS, start=1):
        # If this column participates in the scoring model, append its weight
        # so the advisor can see weights even when scrolled deep into a category.
        weight_hint = WEIGHT_REF_ROW.get(attr)
        if weight_hint:
            header_value = f"{header}\n({weight_hint})"
        else:
            header_value = header

        cell = ws.cell(row=1, column=col_idx, value=header_value)
        cell.fill = FILL_HEADER
        cell.font = FONT_HEADER
        cell.border = BORDER_THIN
        cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
        ws.column_dimensions[get_column_letter(col_idx)].width = width

    # Freeze col A + row 1 so fund names and headers stay visible while scrolling
    ws.freeze_panes = "B2"

    # ── Group funds by category ────────────────────────────────────────────────
    categories: Dict[str, List[Fund]] = {}
    category_order = []
    for fund in funds:
        if fund.category not in categories:
            category_order.append(fund.category)
        categories.setdefault(fund.category, []).append(fund)

    current_row = 2

    for idx, cat_name in enumerate(category_order):
        cat_funds = categories[cat_name]

        # Sort by score (displayed value) descending so Weightage column is strictly largest-to-lowest
        sorted_funds = sorted(
            cat_funds,
            key=lambda f: (-(f.score or 0), (f.name or "").lower(), getattr(f, 'order', 0)),
        )

        # Quartiles by positional rank, not by score thresholds.
        # This guarantees consistent quartile sizing even when many funds share the same score.
        quartile_by_fund_id: Dict[int, int] = {}
        for pos, fund in enumerate(sorted_funds):
            band = _quartile_band_for_position(pos, len(sorted_funds))
            if band is not None:
                quartile_by_fund_id[id(fund)] = band

        # ── Header row (repeat before each category except first) ─────────────
        if idx > 0:
            ws.row_dimensions[current_row].height = 32
            for col_idx, (header, _, _, _, _) in enumerate(XLSX_COLUMNS, start=1):
                cell = ws.cell(row=current_row, column=col_idx, value=header)
                cell.fill = FILL_HEADER
                cell.font = FONT_HEADER
                cell.border = BORDER_THIN
                cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
            current_row += 1

        # ── Category header row ───────────────────────────────────────────────
        ws.row_dimensions[current_row].height = 20
        for col_idx in range(1, NUM_COLS + 1):
            cell = ws.cell(row=current_row, column=col_idx)
            cell.fill = FILL_WHITE
            cell.border = BORDER_THIN
        cat_cell = ws.cell(row=current_row, column=1, value=cat_name)
        cat_cell.font = FONT_CAT_HEADER
        cat_cell.alignment = Alignment(horizontal="left", vertical="center", wrap_text=True)
        ws.merge_cells(start_row=current_row, start_column=1,
                       end_row=current_row, end_column=NUM_COLS - 1)
        current_row += 1

        # ── BM Index row ───────────────────────────────────────────────────────
        bm_vals = bm_data.get(cat_name, {})
        ws.row_dimensions[current_row].height = 14
        for col_idx, (header, attr, _, _, _) in enumerate(XLSX_COLUMNS, start=1):
            val = None
            if col_idx == 1:
                val = "BM Index"
            elif attr in bm_vals:
                val = _display_numeric_or_na(
                    attr=attr,
                    value=bm_vals[attr],
                    row_type="BM_INDEX",
                    category=cat_name,
                    fund_name="BM Index",
                    decimals=2,
                )

            cell = ws.cell(row=current_row, column=col_idx, value=val)
            if col_idx == 1:
                cell.fill = FILL_BM_ROW
            elif col_idx in [6, 7, 8, 9]:
                # Columns F-I are the CAGR columns; they get the BM CAGR tint.
                cell.fill = FILL_BM_CAGR
            else:
                cell.fill = FILL_WHITE
            cell.font = FONT_DEFAULT_BOLD
            cell.border = BORDER_THIN
            cell.alignment = Alignment(
                horizontal="right" if col_idx > 2 else "left",
                vertical="center", wrap_text=(col_idx == 1)
            )
        current_row += 1

        # ── Category Average row ──────────────────────────────────────────────
        cat_avg_vals = cat_avg_data.get(cat_name, {})
        ws.row_dimensions[current_row].height = 14
        for col_idx, (header, attr, _, _, _) in enumerate(XLSX_COLUMNS, start=1):
            val = None
            if col_idx == 1:
                val = "Category Average"
            elif attr in cat_avg_vals:
                val = _display_numeric_or_na(
                    attr=attr,
                    value=cat_avg_vals[attr],
                    row_type="CATEGORY_AVG",
                    category=cat_name,
                    fund_name="Category Average",
                    decimals=2,
                )

            cell = ws.cell(row=current_row, column=col_idx, value=val)
            if col_idx == 1:
                cell.fill = FILL_CAT_AVG
            elif col_idx in [6, 7, 8, 9, 12, 13]:
                cell.fill = FILL_CAT_CAGR
            else:
                cell.fill = FILL_WHITE
            cell.font = FONT_DEFAULT_BOLD
            cell.border = BORDER_THIN
            cell.alignment = Alignment(
                horizontal="right" if col_idx > 2 else "left",
                vertical="center", wrap_text=(col_idx == 1)
            )
        current_row += 1

        # ── Fund rows ─────────────────────────────────────────────────────────
        fund_start_row = current_row

        top_5_fund_ids = {id(f) for f in sorted_funds[:5]}

        for fund in sorted_funds:
            # 36pt height = comfortable 2-line display for long fund names
            # without the advisor needing to drag rows on macOS or Windows
            ws.row_dimensions[current_row].height = 36

            weightage = fund.score or 0
            score_val = round(weightage, 3)
            is_top_5 = id(fund) in top_5_fund_ids

            for col_idx, (header, attr, _, _, decimals) in enumerate(XLSX_COLUMNS, start=1):
                if attr == "weightage":
                    val = score_val
                    cell_font = FONT_DEFAULT_BOLD if is_top_5 else FONT_DEFAULT
                elif attr:
                    raw_val = getattr(fund, attr, None)
                    if attr in ('name', 'benchmark'):
                        val = raw_val if raw_val else None
                        cell_font = FONT_DEFAULT_BOLD if (col_idx == 1 and is_top_5) else FONT_DEFAULT
                    else:
                        val = _display_numeric_or_na(
                            attr=attr,
                            value=raw_val,
                            row_type="FUND",
                            category=fund.category,
                            fund_obj=fund,
                            fund_name=fund.name,
                            decimals=decimals,
                        )
                        cell_font = FONT_DEFAULT
                else:
                    val = None
                    cell_font = FONT_DEFAULT

                cell = ws.cell(row=current_row, column=col_idx, value=val)

                if is_top_5 and col_idx == 1:
                    cell.fill = FILL_WEIGHTED_YELLOW
                elif attr == "weightage":
                    # Quartile fill by positional band computed above (0 = best quartile).
                    quartile_band = quartile_by_fund_id.get(id(fund))
                    if quartile_band == 0: cell.fill = FILL_QUARTILE_GREEN
                    elif quartile_band == 1: cell.fill = FILL_QUARTILE_YELLOW
                    elif quartile_band == 2: cell.fill = FILL_QUARTILE_ORANGE
                    elif quartile_band == 3: cell.fill = FILL_QUARTILE_RED
                    else: cell.fill = FILL_WHITE
                else:
                    cell.fill = FILL_WHITE

                cell.font = cell_font
                cell.border = BORDER_THIN
                cell.alignment = Alignment(
                    horizontal="left" if col_idx <= 2 else "right",
                    vertical="top",  # top-align so wrapped text reads naturally
                    wrap_text=True,  # prevents truncation on any screen or zoom level
                )

                # Columns C/D (TER, Turnover) display as percentages.
                if col_idx == 3: cell.number_format = '0.00%'
                elif col_idx == 4: cell.number_format = '0.00%'
                elif attr == "weightage": cell.number_format = '0.000'

            current_row += 1

        # Apply conditional formatting to this section's fund rows
        fund_end_row = current_row - 1
        if fund_end_row >= fund_start_row and cat_avg_vals:
            _apply_conditional_formatting(ws, fund_start_row, fund_end_row, cat_avg_vals)

    wb.save(str(output_path))
    if na_audit_rows:
        audit_path = output_path.with_name(f"{output_path.stem}_na_audit.txt")
        lines = [
            "NA AUDIT TRACE",
            f"Generated: {datetime.now().isoformat()}",
            "Columns: row_type<TAB>category<TAB>fund_name<TAB>metric_attr<TAB>reason",
            "-" * 80,
            *na_audit_rows,
        ]
        audit_path.write_text("\n".join(lines), encoding="utf-8")
        print(f"NA audit trace written: {audit_path}")
    return str(output_path)
1145
+
1146
+
1147
+ def _avg(values: List[Optional[float]]) -> Optional[float]:
1148
+ """Compute average of non-None values."""
1149
+ valid = [v for v in values if v is not None]
1150
+ if not valid:
1151
+ return None
1152
+ return round(sum(valid) / len(valid), 2)
1153
+
1154
+
1155
+ # ─── Pipeline entry ────────────────────────────────────────────────────────────────
1156
+
1157
def run_data_engine(csv_path: str,
                    output_path: str = "output/fund_analysis.xlsx",
                    use_comprehensive_scoring: bool = True) -> List[Fund]:
    """
    Full pipeline: load -> score -> export Excel.

    Args:
        csv_path: Path to the fund-stats CSV file
        output_path: Path to save the output Excel file
        use_comprehensive_scoring: If True, uses AI-suggested model (10-point scale with Top/Bottom 10).
                                   If False, uses legacy CAGR-based weightage.

    Returns:
        The list of Fund objects with ``score`` and ``weightage`` populated.
    """
    print(f"Loading fund data from: {csv_path}")
    # NOTE(review): ref_fund_weightages is unpacked but not used here — confirm
    # whether it is still needed or can be dropped from the loader's return.
    funds, bm_data, cat_avg_data, ref_fund_weightages = load_fund_csv(csv_path)
    print(f" Loaded {len(funds)} fund schemes")

    # Proactively fix zero / missing drawdown cells using live NAV history
    # so Maximum Drawdown can participate in scoring instead of staying at 0.
    try:
        fixed_mdd = drawdown_zero_fix(funds, verbose=True)
        if fixed_mdd:
            print(f" Fixed {fixed_mdd} zero/missing drawdown cells via NAV engine")
    except Exception as exc:
        # Best-effort: a failed drawdown fix must not abort the whole export.
        print(f" WARNING: drawdown_zero_fix failed: {exc}")

    if use_comprehensive_scoring:
        # Use AI-suggested model (10-point scale)
        print(" Using AI-suggested scoring model (10-point scale with Top/Bottom 10)...")

        # Import and use the new compute_scores function
        funds = compute_scores(funds)

        # Copy score to weightage field for Excel export compatibility
        for fund in funds:
            fund.weightage = int(round(fund.score)) if fund.score else 0

        with_highlight = sum(1 for f in funds if (f.score or 0) > 8)
        print(f" Calculated AI-suggested weightage. {with_highlight} funds have score > 8")
    else:
        # Use legacy CAGR-based weightage
        print(" Using legacy CAGR-based weightage...")
        for fund in funds:
            cat_avg_vals = cat_avg_data.get(fund.category, {})
            fund.weightage = _calculate_weightage(fund, cat_avg_vals)
            fund.score = float(fund.weightage)

        with_highlight = sum(1 for f in funds if (f.weightage or 0) > 8)
        print(f" Calculated weightage. {with_highlight} funds have weightage > 8")

    print(f"Exporting processed Excel to: {output_path}")
    path = export_excel(funds, output_path, bm_data, cat_avg_data)
    print(f"Done! Saved: {path}")

    return funds
src/index_fund_ingest.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Index Fund Ingest β€” capture index funds the same way as raw CSV (mftool/AMFI).
3
+
4
+ Two sources:
5
+ - mftool (default): Same as raw CSV under PS β€” AMFI category 38 (Index Funds/ETFs).
6
+ Returns only the schemes AMFI lists under that category (curated, ~same count as
7
+ your fund-stats CSV Index Fund section). Output format matches PS: "Index Fund",
8
+ hyphenated fund names.
9
+ - mfapi: Search mfapi.in and filter by index; use when you need more schemes.
10
+
11
+ Usage:
12
+ python -m src.index_fund_ingest [--output index_funds.csv] # default: mftool
13
+ python -m src.index_fund_ingest --source mfapi [--limit 100] # mfapi search
14
+ Then: enrich the output CSV, merge into main fund CSV, run data_engine as usual.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import csv
21
+ import re
22
+ import time
23
+ from datetime import datetime, timedelta
24
+ from pathlib import Path
25
+
26
+ import requests
27
+
28
# Same AMFI gateway as mftool (get_open_ended_other_scheme_performance)
AMFI_FUND_PERFORMANCE_URL = "https://www.amfiindia.com/gateway/pollingsebi/api/amfi/fundperformance"
AMFI_CATEGORY_OTHER = 5
AMFI_SUBCATEGORY_INDEX_FUNDS = 38  # "Index Funds/ETFs"

# mfapi.in endpoints; MFAPI_NAV takes a scheme_code via str.format.
MFAPI_LIST = "https://api.mfapi.in/mf"
MFAPI_SEARCH = "https://api.mfapi.in/mf/search"
MFAPI_NAV = "https://api.mfapi.in/mf/{scheme_code}"
SLEEP = 0.3  # polite delay between API calls

# CSV headers matching project fund-stats CSV (must match data_engine / csv_enrichment).
# Note the embedded "\n" in the two capture-ratio headers — it is part of the header text.
FUND_CSV_HEADERS = [
    "Fund", "Category", "Scheme Code", "Launch Date", "Total Assets (in Cr)",
    "TER", "Turn over (%)", "CAGR Since Inception",
    "1 Year CAGR", "1 Year Category CAGR", "1 Year Benchmark CAGR",
    "3 Years CAGR", "3 Years Category CAGR", "3 Years Benchmark CAGR",
    "5 Years CAGR", "5 Years Category CAGR", "5 Years Benchmark CAGR",
    "10 Years CAGR", "10 Years Category CAGR", "10 Years Benchmark CAGR",
    "Benchmark Type", "NAV", "Alpha", "Beta", "Standard Deviation",
    "Sharpe Ratio", "Volatility", "Mean", "Sortino Ratio",
    "Up Market Capture\nRatio", "Down Market Capture\nRatio",
    "Maximum Drawdown", "R-Squared", "Information Ratio", "P/E Ratio", "P/B Ratio",
]

# Raw CSV under PS uses "Index Fund" (no "Equity:" prefix) for this category
INDEX_FUND_CATEGORY_PS = "Index Fund"

# mfapi scheme_category (from NAV meta) -> our Category label
# (matched as lowercase substrings by _normalize_category)
CATEGORY_MAP = {
    "index fund": "Equity: Index Fund",
    "index funds": "Equity: Index Fund",
    "equity scheme - index fund": "Equity: Index Fund",
    "equity scheme - index funds": "Equity: Index Fund",
}
62
+
63
+
64
+ def _to_hyphenated(name: str) -> str:
65
+ """Convert scheme name to hyphenated form like raw CSV under PS (e.g. DSP-Nifty-50-Index-Fund-Regular-Plan-Growth)."""
66
+ if not name:
67
+ return ""
68
+ # Replace spaces and multiple hyphens with single hyphen, strip
69
+ s = re.sub(r"[\s_]+", "-", name.strip())
70
+ return re.sub(r"-+", "-", s).strip("-")
71
+
72
+
73
+ def _get_amfi_report_date() -> str:
74
+ """DD-MMM-YYYY for AMFI API. Use last weekday (API returns empty for weekend dates)."""
75
+ today = datetime.now().date()
76
+ d = today
77
+ for _ in range(7):
78
+ if d.weekday() < 5: # Mon=0 .. Fri=4
79
+ break
80
+ d -= timedelta(days=1)
81
+ return d.strftime("%d-%b-%Y")
82
+
83
# Scheme name fragments -> Benchmark Type (for nav_metrics_engine)
# Order matters: more specific (e.g. Nifty 500) before generic (Nifty 50)
# Patterns are regexes applied by _infer_benchmark to the lowercased scheme name.
BENCHMARK_INFER = [
    (r"nifty\s*500|nifty500", "Nifty 500"),
    (r"nifty\s*200|nifty200", "Nifty 200"),
    (r"nifty\s*100|nifty100", "Nifty 100"),
    (r"nifty\s*next\s*50|nifty\s*junior|niftyjr", "Nifty Next 50"),
    (r"nifty\s*50|nifty50", "Nifty 50"),
    (r"nifty\s*midcap\s*150|midcap\s*150", "Nifty Midcap 150"),
    (r"nifty\s*smallcap\s*250|smallcap\s*250", "Nifty Smallcap 250"),
    (r"sensex|bse\s*sensex", "BSE Sensex"),
    (r"bse\s*100", "BSE 100"),
    (r"bse\s*500", "BSE 500"),
]
97
+
98
+
99
+ def _normalize_category(meta_category: str | None) -> str:
100
+ if not meta_category:
101
+ return "Equity: Index Fund"
102
+ key = meta_category.strip().lower()
103
+ for k, v in CATEGORY_MAP.items():
104
+ if k in key:
105
+ return v
106
+ if "index" in key:
107
+ return "Equity: Index Fund"
108
+ return meta_category.strip()
109
+
110
+
111
def _infer_benchmark(scheme_name: str) -> str:
    """Guess the Benchmark Type from the scheme name via BENCHMARK_INFER patterns."""
    lowered = (scheme_name or "").lower()
    for pattern, benchmark in BENCHMARK_INFER:
        if re.search(pattern, lowered):
            return benchmark
    return "Nifty 50"  # safe default for index funds
117
+
118
+
119
def _search_mfapi(query: str, limit: int = 200) -> list[dict]:
    """Return list of {schemeCode, schemeName} from mfapi search."""
    try:
        response = requests.get(MFAPI_SEARCH, params={"q": query}, timeout=15)
        response.raise_for_status()
        payload = response.json()
        # Anything other than a JSON array is treated as "no results".
        return payload[:limit] if isinstance(payload, list) else []
    except Exception as e:
        # Best-effort: log and return empty rather than aborting discovery.
        print(f" [search] error for '{query}': {e}")
        return []
131
+
132
+
133
def _fetch_nav_meta(scheme_code: str) -> dict | None:
    """Fetch NAV endpoint and return meta only (scheme_name, scheme_category)."""
    url = MFAPI_NAV.format(scheme_code=scheme_code)
    try:
        response = requests.get(url, params={"limit": 1}, timeout=15)
        response.raise_for_status()
        meta = response.json().get("meta") or {}
        # Normalize missing/None values to "" for the three fields we use.
        return {
            key: meta.get(key) or ""
            for key in ("scheme_name", "scheme_category", "fund_house")
        }
    except Exception as e:
        print(f" [nav meta] {scheme_code}: {e}")
        return None
149
+
150
+
151
def get_index_funds_via_mftool(verbose: bool = True) -> list[dict]:
    """
    Fetch index funds from the same AMFI API used by mftool (category 5, subCategory 38).
    Returns the same curated list as would appear in the raw CSV under PS — not 10k schemes.
    Each item: scheme_name, benchmark_type. Scheme code is left blank; enrichment will resolve.

    Tries today's date first, then walks back up to 8 calendar days (skipping
    weekends) until the API returns a non-empty list.
    """
    out: list[dict] = []
    base_date = datetime.now().date()
    for day_back in range(8):  # try up to 8 days back to get a date with data
        d = base_date - timedelta(days=day_back)
        if d.weekday() >= 5:  # skip weekend
            continue
        report_date = d.strftime("%d-%b-%Y")
        payload = {
            "maturityType": 1,
            "category": AMFI_CATEGORY_OTHER,
            "subCategory": AMFI_SUBCATEGORY_INDEX_FUNDS,
            "mfid": 0,
            "reportDate": report_date,
        }
        try:
            resp = requests.post(
                AMFI_FUND_PERFORMANCE_URL,
                headers={"User-Agent": "Mozilla/5.0"},
                json=payload,
                timeout=25,
            )
            resp.raise_for_status()
            data = resp.json()
            raw_list = data.get("data") or []
            for item in raw_list:
                name = (item.get("schemeName") or "").strip()
                if not name:
                    continue
                # Exclude ETFs so we match raw CSV (Index Fund section has open-ended funds only)
                # NOTE(review): this matches the literal token " ETF" only — names where
                # "ETF" appears without a leading space would slip through; confirm coverage.
                if " ETF" in name or name.endswith(" ETF"):
                    continue
                benchmark = (item.get("benchmark") or "").strip() or "Nifty 50"
                out.append({
                    "scheme_name": name,
                    "benchmark_type": benchmark,
                    "scheme_code": "",  # AMFI API doesn't return code; enrichment resolves
                    "category": INDEX_FUND_CATEGORY_PS,
                })
            if out:
                if verbose:
                    print(f"[mftool] AMFI category 38 (Index Funds/ETFs): {len(out)} schemes (report date {report_date})")
                break
        except Exception as e:
            # Only report the first failure to keep the retry loop quiet.
            if verbose and day_back == 0:
                print(f"[mftool] AMFI request failed for {report_date}: {e}")
            continue
    if not out and verbose:
        print("[mftool] No schemes returned (tried several weekdays). Check AMFI API.")
    return out
206
+
207
+
208
+ def _is_index_scheme(meta_category: str, scheme_name: str) -> bool:
209
+ """True if this scheme should be treated as index fund."""
210
+ cat = (meta_category or "").lower()
211
+ name = (scheme_name or "").lower()
212
+ if "index" in cat:
213
+ return True
214
+ if "index" in name and ("fund" in name or "etf" not in name):
215
+ return True
216
+ # Explicit index benchmarks in name
217
+ if re.search(r"nifty\s*50|nifty\s*next\s*50|sensex|nifty\s*100|nifty\s*500", name):
218
+ return True
219
+ return False
220
+
221
+
222
def discover_index_schemes(
    search_queries: list[str] | None = None,
    limit_per_query: int = 150,
    require_index_category: bool = True,
    verbose: bool = True,
) -> list[dict]:
    """
    Discover index fund schemes via mfapi search and NAV meta.

    For each search query, candidate schemes are looked up one NAV-meta call at
    a time (throttled by SLEEP), filtered via _is_index_scheme, then normalized.

    Returns list of dicts: scheme_code, scheme_name, category, benchmark_type.
    """
    if search_queries is None:
        search_queries = ["Index", "Index Fund", "Nifty 50", "Nifty Next 50", "Sensex"]
    seen_codes: set[int] = set()
    out: list[dict] = []

    for q in search_queries:
        if verbose:
            print(f"[discover] search q={q!r} …")
        candidates = _search_mfapi(q, limit=limit_per_query)
        for item in candidates:
            code = item.get("schemeCode")
            if code is None or code in seen_codes:
                continue
            name = item.get("schemeName") or ""
            time.sleep(SLEEP)  # polite throttle before each NAV-meta request
            meta = _fetch_nav_meta(str(code))
            if not meta:
                continue
            cat = meta.get("scheme_category") or ""
            if require_index_category and not _is_index_scheme(cat, name):
                continue
            # NOTE(review): codes are only recorded AFTER passing the filter, so a
            # rejected code appearing under a later query triggers another NAV-meta
            # fetch — confirm whether rejected codes should also be remembered.
            seen_codes.add(code)
            category = _normalize_category(cat)
            benchmark = _infer_benchmark(meta.get("scheme_name") or name)
            out.append({
                "scheme_code": str(code),
                "scheme_name": meta.get("scheme_name") or name,
                "category": category,
                "benchmark_type": benchmark,
            })
            if verbose:
                print(f"  + {meta.get('scheme_name', name)[:55]} | {category} | {benchmark}")
    return out
266
+
267
+
268
def write_fund_csv(rows: list[dict], path: str | Path) -> None:
    """Write CSV with FUND_CSV_HEADERS; each row is a dict with those keys (blank = '')."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # utf-8-sig writes a BOM so spreadsheet apps detect the encoding;
    # restval="" blanks missing keys, extrasaction="ignore" drops unknown ones.
    with open(target, "w", encoding="utf-8-sig", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=FUND_CSV_HEADERS, restval="", extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)
276
+
277
+
278
def build_csv_rows(schemes: list[dict], use_ps_format: bool = False) -> list[dict]:
    """Convert discover output to CSV row dicts (metrics blank).
    use_ps_format: when True, Fund = hyphenated name, Category = 'Index Fund' (matches raw CSV under PS).
    """
    default_category = "Index Fund" if use_ps_format else "Equity: Index Fund"
    rows: list[dict] = []
    for scheme in schemes:
        row = dict.fromkeys(FUND_CSV_HEADERS, "")
        name = scheme.get("scheme_name") or ""
        if use_ps_format:
            row["Fund"] = _to_hyphenated(name)
        else:
            # Strip commas so the plain name stays CSV-friendly downstream.
            row["Fund"] = name.replace(",", " ")
        row["Category"] = scheme.get("category") or default_category
        row["Scheme Code"] = scheme.get("scheme_code") or ""
        row["Benchmark Type"] = scheme.get("benchmark_type") or "Nifty 50"
        rows.append(row)
    return rows
292
+
293
+
294
def run_ingest(
    output_path: str | Path = "index_funds.csv",
    source: str = "mftool",
    search_queries: list[str] | None = None,
    limit_per_query: int = 150,
    verbose: bool = True,
) -> tuple[list[dict], Path]:
    """
    Discover index schemes, build CSV rows, write CSV.

    source: "mftool" = same as raw CSV (AMFI category 38, curated list). "mfapi" = search mfapi.
    Returns (list of scheme dicts, output path).
    """
    # The PS-style (hyphenated name, "Index Fund" category) output is tied to
    # the mftool/AMFI source; mfapi output keeps plain names.
    use_ps_format = source.lower() == "mftool"
    if use_ps_format:
        schemes = get_index_funds_via_mftool(verbose=verbose)
    else:
        schemes = discover_index_schemes(
            search_queries=search_queries,
            limit_per_query=limit_per_query,
            require_index_category=True,
            verbose=verbose,
        )
    rows = build_csv_rows(schemes, use_ps_format=use_ps_format)
    out = Path(output_path)
    write_fund_csv(rows, out)
    if verbose:
        print(f"\n[ingest] Wrote {len(rows)} rows to {out.absolute()} (source={source})")
        print(" Next: run CSV enrichment on this file, then merge into main fund CSV.")
    return schemes, out
325
+
326
+
327
def main() -> None:
    """CLI entry point: parse arguments and hand off to run_ingest()."""
    ap = argparse.ArgumentParser(
        description="Index fund ingest — same list as raw CSV (mftool/AMFI) or mfapi search"
    )
    ap.add_argument("--output", "-o", default="index_funds.csv", help="Output CSV path")
    ap.add_argument(
        "--source",
        choices=("mftool", "mfapi"),
        default="mftool",
        help="mftool = AMFI category 38 (same as raw CSV under PS). mfapi = search (more schemes).",
    )
    # action="append" lets the user pass -s multiple times; None means use the
    # default query list inside discover_index_schemes.
    ap.add_argument("--search", "-s", action="append", default=None,
                    help="[mfapi only] Search query (repeatable). Default: Index, Index Fund, ...")
    ap.add_argument("--limit", "-n", type=int, default=150,
                    help="[mfapi only] Max schemes per search query")
    ap.add_argument("--quiet", "-q", action="store_true", help="Less output")
    args = ap.parse_args()
    run_ingest(
        output_path=args.output,
        source=args.source,
        search_queries=args.search,
        limit_per_query=args.limit,
        verbose=not args.quiet,
    )
351
+
352
+
353
+ if __name__ == "__main__":
354
+ main()
src/models.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data models for MF Portfolio Analysis Tool.
3
+ """
4
+ from dataclasses import dataclass, field
5
+ from typing import Optional
6
+
7
+
8
@dataclass
class Fund:
    """Represents a single mutual fund scheme from the fund universe CSV.

    All metric fields default to None so a Fund can be built from a sparsely
    populated CSV row; consumers must treat None as "data unavailable".
    NOTE(review): the *_cat / *_bm suffixes presumably hold the category-average
    and benchmark CAGRs for the same period β€” confirm against the CSV columns.
    """
    name: str
    category: str
    benchmark: str

    # Cost
    ter: Optional[float] = None  # Total Expense Ratio (%)
    turnover: Optional[float] = None  # Portfolio Turnover (%)

    # Returns (all percentages)
    mean: Optional[float] = None
    cagr_1y: Optional[float] = None
    cagr_1y_cat: Optional[float] = None
    cagr_1y_bm: Optional[float] = None
    cagr_3y: Optional[float] = None
    cagr_3y_cat: Optional[float] = None
    cagr_3y_bm: Optional[float] = None
    cagr_5y: Optional[float] = None
    cagr_5y_cat: Optional[float] = None
    cagr_5y_bm: Optional[float] = None
    cagr_10y: Optional[float] = None
    cagr_10y_cat: Optional[float] = None
    cagr_10y_bm: Optional[float] = None
    cagr_inception: Optional[float] = None
    nav: Optional[float] = None

    # Valuation
    pe_ratio: Optional[float] = None
    pb_ratio: Optional[float] = None

    # Risk metrics
    alpha: Optional[float] = None
    beta: Optional[float] = None
    std_dev: Optional[float] = None
    sharpe: Optional[float] = None
    volatility: Optional[float] = None
    sortino: Optional[float] = None
    up_capture: Optional[float] = None
    down_capture: Optional[float] = None
    max_drawdown: Optional[float] = None
    r_squared: Optional[float] = None
    info_ratio: Optional[float] = None
    aum: Optional[float] = None
    fill_status: Optional[str] = None  # provenance marker for how metrics were filled

    # Scoring (computed)
    score: Optional[float] = None
    rank_in_category: Optional[int] = None
    is_top_quartile: bool = False
    weightage: Optional[int] = None  # Number of periods beating benchmark
    order: int = 0  # Preserves original CSV insertion order for sort tiebreaker
61
+
62
+
63
@dataclass
class ClientHolding:
    """Represents a single mutual fund holding in a client's portfolio.

    Only the scheme name and current value are required; everything else is
    either optional input (SIP details) or filled in later by the analysis
    pipeline (matched Fund, computed metrics, advisory suggestion).
    """
    scheme_name: str
    current_value: float
    invested_amount: Optional[float] = None
    sip_amount: Optional[float] = None
    sip_frequency: Optional[str] = None  # Monthly / Quarterly etc.

    # Matched fund data (resolved against the fund universe)
    fund: Optional[Fund] = None

    # Computed
    allocation_pct: float = 0.0  # share of the total portfolio value
    xirr: Optional[float] = None
    is_underperforming: bool = False

    # Advisory
    suggested_fund: Optional[Fund] = None  # replacement candidate, if any
    switch_reason: Optional[str] = None  # human-readable rationale for the switch
83
+
84
+
85
@dataclass
class Client:
    """Client details; only the name is mandatory."""
    name: str
    age: Optional[int] = None
    email: Optional[str] = None
    mobile: Optional[str] = None
    pan: Optional[str] = None  # presumably the Indian tax id (PAN) β€” stored as-is
93
+
94
+
95
@dataclass
class Advisor:
    """Financial advisor details.

    NOTE(review): real personal contact details are hard-coded as defaults and
    therefore live in source control β€” consider loading them from configuration.
    """
    name: str = "RAVICHANDRAN"
    phone: str = "9281364703"
    email: str = "c4c.ravi@gmail.com"
    arn: str = "ARN-243354"  # presumably the AMFI registration number
    location: str = "Chennai"
103
+
104
+
105
@dataclass
class PortfolioReport:
    """The complete portfolio analysis report for a client.

    Aggregates per-holding data plus portfolio-level metrics, exposure
    breakdowns, allocation splits and a wealth projection.
    """
    client: Client
    advisor: Advisor
    holdings: list = field(default_factory=list)  # presumably ClientHolding items β€” confirm

    # Portfolio-level metrics
    total_current_value: float = 0.0
    total_invested: float = 0.0
    unrealized_gain: float = 0.0
    portfolio_xirr: Optional[float] = None
    sharpe: Optional[float] = None
    alpha: Optional[float] = None
    beta: Optional[float] = None
    std_dev: Optional[float] = None

    # Exposure warnings
    amc_exposure: dict = field(default_factory=dict)  # AMC -> pct
    scheme_exposure: dict = field(default_factory=dict)  # scheme -> pct
    exposure_warnings: list = field(default_factory=list)  # list of warning strings

    # Allocation
    market_cap_allocation: dict = field(default_factory=dict)  # Large/Mid/Small/Other -> pct
    sector_allocation: dict = field(default_factory=dict)  # sector -> pct

    # Wealth projection
    wealth_projection: dict = field(default_factory=dict)  # years -> projected value
src/nav_metrics_engine.py ADDED
@@ -0,0 +1,1005 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sqlite3
5
+ import threading
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+ import requests
15
+ import yfinance as yf
16
+
17
+
18
TRADING_DAYS = 252  # trading days per year, used to annualise daily return stats
RF_RATE = 0.06  # annual risk-free rate (6%) β€” presumably an Indian T-bill proxy; confirm
TRAILING_YEARS = 3  # metrics are computed over a trailing 3-year window
NAV_STALE_DAYS = 30  # a NAV series whose latest point is older than this is stale

# ── Disk cache config ─────────────────────────────────────────────────────────
# NAV history is refreshed if older than 7 days; benchmark index once a day.
_CACHE_DB_PATH = Path.home() / ".mf_nav_cache.db"
_NAV_TTL_SECS = 7 * 86_400  # 7 days
_BENCH_TTL_SECS = 1 * 86_400  # 1 day
_DB_LOCK = threading.Lock()  # one writer at a time across threads
29
+
30
+
31
# Canonical metric labels emitted by this engine. The embedded "\n" in the
# capture-ratio labels is intentional β€” presumably matching two-line column
# headers in the output sheet; TODO confirm against the CSV writer.
OUTPUT_METRICS: tuple[str, ...] = (
    "Alpha",
    "Beta",
    "Standard Deviation",
    "Volatility",
    "Mean",
    "Sharpe Ratio",
    "Sortino Ratio",
    "Up Market Capture\nRatio",
    "Down Market Capture\nRatio",
    "Maximum Drawdown",
    "R-Squared",
    "Information Ratio",
)

# Metrics computable from the fund's own NAV series alone.
NAV_ONLY_METRICS: set[str] = {
    "Standard Deviation",
    "Volatility",
    "Mean",
    "Sharpe Ratio",
    "Sortino Ratio",
    "Maximum Drawdown",
}

# Metrics that additionally require a benchmark index series.
BENCHMARK_DEPENDENT_METRICS: set[str] = {
    "Alpha",
    "Beta",
    "Up Market Capture\nRatio",
    "Down Market Capture\nRatio",
    "R-Squared",
    "Information Ratio",
}
63
+
64
+
65
# Common Indian benchmark labels -> Yahoo Finance ticker
# Last verified: March 2026
# ^NIFTYJR was delisted β€” correct ticker for Nifty Next 50 is now ^NSMIDCP
#
# INVARIANT: every key must already be in _normalize_benchmark_name() form
# (lowercase, no "-" or "_", single spaces). Lookups normalise the input the
# same way, so a hyphenated key can never match β€” the previously hyphenated
# entries below have been normalised to make them reachable.
BENCHMARK_MAP: dict[str, str] = {
    # ── Nifty broad indices ────────────────────────────────────────────────
    "nifty 50": "^NSEI",
    "nifty50": "^NSEI",
    "nifty 50 tri": "^NSEI",
    "nifty next 50": "^NSMIDCP",
    "nifty next 50 tri": "^NSMIDCP",
    "nifty junior": "^NSMIDCP",
    "nifty 100": "^CNX100",
    "nifty 100 tri": "^CNX100",
    "nifty 100 (tri)": "^CNX100",
    "nifty 200": "^CNX200",
    "nifty 500": "^CRSLDX",
    "nifty 500 tri": "^CRSLDX",
    "nifty 500 (tri)": "^CRSLDX",
    "nifty500": "^CRSLDX",
    "nifty500 multicap 50:25:25 tri": "NIFTY500_MULTICAP_50_25_25.NS",
    "nifty500 multicap 50:25:25 (tri)": "NIFTY500_MULTICAP_50_25_25.NS",
    "nifty 500 multicap 50:25:25 (tri)": "NIFTY500_MULTICAP_50_25_25.NS",
    "nifty500 multicap momentum quality 50 tri": "NIFTY500_MULTICAP_50_25_25.NS",
    # ── Nifty midcap / smallcap ────────────────────────────────────────────
    "nifty midcap 150": "NIFTY_MIDCAP_100.NS",
    "nifty midcap 150 tri": "NIFTY_MIDCAP_100.NS",
    "nifty midcap 150 index (tri)": "NIFTY_MIDCAP_100.NS",
    "nifty midcap 100": "NIFTY_MIDCAP_100.NS",
    "nifty midcap 50": "^NSEMDCP50",
    "nifty midcap": "NIFTY_MIDCAP_100.NS",
    "nifty large midcap 250 tri": "NIFTY_LARGEMIDCAP_250.NS",
    "nifty large midcap 250": "NIFTY_LARGEMIDCAP_250.NS",
    # normalised form of "nifty large - midcap 250 index" (hyphen stripped)
    "nifty large midcap 250 index": "NIFTY_LARGEMIDCAP_250.NS",
    "nifty smallcap 250": "NIFTYSMLCAP250.NS",
    "nifty smallcap 250 tri": "NIFTYSMLCAP250.NS",
    "nifty small cap 250 (tri)": "NIFTYSMLCAP250.NS",
    "nifty smallcap 100": "^CNXSC",
    "nifty smallcap": "NIFTYSMLCAP250.NS",
    # ── BSE ───────────────────────────────────────────────────────────────
    "sensex": "^BSESN",
    "bse sensex": "^BSESN",
    "bse 100": "^BSE100",
    "bse 200": "^BSE100",
    "bse 500": "^BSE500",
    "s&p bse liquid rate index": "^NSEI",  # no direct Yahoo ticker; use Nifty as proxy
    # ── Sector / thematic ─────────────────────────────────────────────────
    "nifty bank": "^NSEBANK",
    "nifty bank tri": "^NSEBANK",
    "nifty bank (tri)": "^NSEBANK",
    "nifty private bank": "NIFTY_PVT_BANK.NS",
    "nifty private bank tri": "NIFTY_PVT_BANK.NS",
    "nifty it": "^CNXIT",
    "nifty it tri": "^CNXIT",
    "nifty financial services": "NIFTY_FIN_SERVICE.NS",
    "nifty financial services tri": "NIFTY_FIN_SERVICE.NS",
    "nifty financial services index (tri)": "NIFTY_FIN_SERVICE.NS",
    # normalised form of "nifty financial services ex-bank tri"
    "nifty financial services ex bank tri": "NIFTY_FIN_SERVICE.NS",
    "nifty pharma": "^CNXPHARMA",
    "nifty pharma tri": "^CNXPHARMA",
    "nifty healthcare": "NIFTY_HEALTHCARE.NS",
    "nifty healthcare tri": "NIFTY_HEALTHCARE.NS",
    "nifty healthcare tri.": "NIFTY_HEALTHCARE.NS",  # trailing dot variant
    "nifty fmcg": "^CNXFMCG",
    "nifty fmcg tri": "^CNXFMCG",
    "nifty infrastructure": "^CNXINFRA",
    "nifty infrastructure tri": "^CNXINFRA",
    "nifty india consumption": "NIFTY_INDIA_CONSUMPTION.NS",
    "nifty india consumption tri": "NIFTY_INDIA_CONSUMPTION.NS",
    "nifty india consumption index (tri)": "NIFTY_INDIA_CONSUMPTION.NS",
    "nifty india manufacturing tri": "NIFTY_INDIA_MANUFACTURING.NS",
    "nifty india defence tri": "NIFTY_INDIA_DEFENCE.NS",
    "nifty housing tri": "NIFTY_HOUSING.NS",
    "nifty cpse tri": "NIFTY_CPSE.NS",
    "nifty mnc tri": "NIFTY_MNC.NS",
    "nifty commodities tri": "^CNXCMDT",
    "nifty 100 esg tri": "NIFTY100_ESG.NS",
    "nifty 100 low volatility 30 tri": "NIFTY100_LOWVOL30.NS",
    "nifty ipo tri": "NIFTY_IPO.NS",
    # ── Factor / strategy ─────────────────────────────────────────────────
    "nifty 200 momentum 30 tri": "NIFTY200_MOMENTUM_30.NS",
    # ── Debt / liquid / overnight β€” use Nifty 1D rate / GSec proxies ──────
    "nifty 1d rate index": "^NSEI",  # overnight / liquid funds; no direct Yahoo
    "nifty 1d rate": "^NSEI",
    "crisil liquid overnight index": "^NSEI",
    "nifty 3 year sdl": "^NSEI",
    # normalised form of "nifty 4-8 yr g-sec index" (hyphens stripped)
    "nifty 4 8 yr g sec index": "^NSEI",
    "nifty composite g sec index": "^NSEI",
    # ── Hybrid / balanced ─────────────────────────────────────────────────
    # AK = AdvisorKhoj composite benchmarks β€” no direct Yahoo ticker
    # Mapped to closest equity index proxy based on fund category
    "ak hybrid balanced tri": "^NSEI",  # Dynamic Asset Allocation β†’ Nifty 50
    "ak hybrid aggressive tri": "^NSEI",  # Aggressive Hybrid β†’ Nifty 50
    "ak hybrid conservative tri": "^NSEI",  # Conservative Hybrid β†’ Nifty 50
    "ak multi asset allocation tri": "^CRSLDX",  # Multi Asset β†’ Nifty 500
    "ak equity savings tri": "^NSEI",  # Equity Savings β†’ Nifty 50
    # ── Global ────────────────────────────────────────────────────────────
    "msci acwi tri": "URTH",  # iShares MSCI ACWI ETF as proxy
    "s&p global 1200 tri": "URTH",
    "nifty 50 arbitrage index": "^NSEI",  # arbitrage funds; Nifty proxy
}
166
+
167
+
168
+ # ── Cache backend: SQLite (local) or Neon/Postgres (production) ──────────────
169
+ #
170
+ # Your Neon DSN (pooler endpoint β€” correct for serverless/HuggingFace):
171
+ #   postgresql://<NEON_USER>:<NEON_PASSWORD>@ep-damp-river-advc7q1j-pooler.c-2.us-east-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require  (credentials redacted β€” rotate the previously committed password)
172
+ #
173
+ # How to switch backends (zero code change needed):
174
+ #
175
+ # LOCAL TESTING (SQLite, default β€” no setup):
176
+ # β†’ Do NOT set DATABASE_URL in your local .env. Uses ~/.mf_nav_cache.db.
177
+ #
178
+ # NEON / HUGGINGFACE SPACES:
179
+ # β†’ Add to your .env OR HuggingFace Space Secret:
180
+ #       DATABASE_URL=postgresql://<NEON_USER>:<NEON_PASSWORD>@ep-damp-river-advc7q1j-pooler.c-2.us-east-1.aws.neon.tech/neondb?sslmode=require&channel_binding=require  (set as a Space secret; never commit real credentials)
181
+ # β†’ Add to requirements.txt:
182
+ # psycopg2-binary
183
+ # β†’ Done. Code detects DATABASE_URL and uses Neon automatically.
184
+ #
185
+ # WHY POOLER ENDPOINT (not direct):
186
+ # HuggingFace Spaces can spin up many workers concurrently.
187
+ # Pooler endpoint (ep-...-pooler.c-2...) handles connection bursts safely.
188
+ # Direct endpoint (ep-... without -pooler) has a hard cap of ~100 connections.
189
+ #
190
+ # WHY channel_binding=require:
191
+ # Your Neon project enforces channel binding. psycopg2 supports it via libpq >= 14.
192
+ # The param is passed through the DSN string β€” no extra code needed.
193
+ #
194
+ # Table schema (identical for SQLite and Postgres):
195
+ # nav_cache(key TEXT PRIMARY KEY, data TEXT NOT NULL, ts DOUBLE PRECISION NOT NULL)
196
+
197
+ import os as _os
198
+
199
# Presence of DATABASE_URL switches the cache backend: set -> Neon/Postgres,
# unset -> local SQLite file (see the backend notes above).
_DATABASE_URL = _os.environ.get("DATABASE_URL", "")
_USE_POSTGRES = bool(_DATABASE_URL)  # True -> Neon/Postgres; False -> SQLite
201
+
202
+
203
# ── Thread-local Postgres connection pool ─────────────────────────────────────
# Opening a new psycopg2 connection per cache query costs ~100-200ms on Neon
# (TLS handshake + auth). With 12 parallel workers Γ— 2 queries/fund Γ— 478 funds
# that is ~1000 round-trips. Fix: one persistent connection per thread, reused
# across all queries that thread handles.
import threading as _threading

_tls = _threading.local()


def _get_pg_conn():
    """
    Return a thread-local persistent Neon connection, creating one if needed.

    A cached connection is validated with a cheap "SELECT 1" probe; a closed
    or dead one is transparently replaced with a fresh connection.
    """
    import psycopg2  # type: ignore

    conn = getattr(_tls, "pg_conn", None)
    if conn is not None:
        try:
            # Lightweight liveness check β€” closed flag or dead socket.
            if not conn.closed:
                # Context-managed cursor so each probe does not leak an
                # open cursor on the long-lived connection.
                with conn.cursor() as cur:
                    cur.execute("SELECT 1")
                return conn
        except Exception:
            pass  # Connection is dead β€” fall through to re-create

    conn = psycopg2.connect(
        _DATABASE_URL,
        connect_timeout=10,
        # TCP keepalives so idle worker threads don't lose the socket silently.
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=3,
    )
    _tls.pg_conn = conn
    return conn
238
+
239
+
240
def _init_cache_db() -> None:
    """Create cache table if it doesn't exist (idempotent, works for both backends).

    Postgres: runs on the thread-local Neon connection. The connection is
    deliberately NOT closed afterwards β€” _get_pg_conn() pools one connection
    per thread, and closing it here (as an earlier revision did) forced an
    expensive reconnect on the very next cache query.
    SQLite: guarded by _DB_LOCK, one writer at a time.
    """
    if _USE_POSTGRES:
        try:
            conn = _get_pg_conn()
            with conn:  # transaction scope β€” commits the DDL on success
                with conn.cursor() as cur:
                    cur.execute("""
                        CREATE TABLE IF NOT EXISTS nav_cache (
                            key TEXT PRIMARY KEY,
                            data TEXT NOT NULL,
                            ts DOUBLE PRECISION NOT NULL
                        )
                    """)
        except Exception as e:
            # Init failure is non-fatal: reads/writes are individually guarded.
            print(f"[cache] Postgres init warning: {e}")
    else:
        with _DB_LOCK, sqlite3.connect(_CACHE_DB_PATH) as db:
            db.execute("""
                CREATE TABLE IF NOT EXISTS nav_cache (
                    key TEXT PRIMARY KEY,
                    data TEXT NOT NULL,
                    ts REAL NOT NULL
                )
            """)
            db.commit()
+
268
+
269
def _cache_get(key: str, ttl: float) -> pd.DataFrame | None:
    """Return the cached DataFrame for *key* if still fresh, else None.

    Lookup order: bulk preload dict (zero network cost), then the persistent
    backend (Neon when DATABASE_URL is set, otherwise local SQLite).
    Any backend error degrades to a cache miss.
    """
    preloaded = _PRELOAD_CACHE.get(key)
    if preloaded is not None:
        return preloaded

    try:
        if _USE_POSTGRES:
            # Thread-local pooled connection; cursor closed, connection kept.
            with _get_pg_conn().cursor() as cur:
                cur.execute(
                    "SELECT data, ts FROM nav_cache WHERE key = %s", (key,)
                )
                row = cur.fetchone()
        else:
            with sqlite3.connect(_CACHE_DB_PATH) as db:
                row = db.execute(
                    "SELECT data, ts FROM nav_cache WHERE key = ?", (key,)
                ).fetchone()

        if row is not None:
            payload, stored_at = row
            if (time.time() - stored_at) < ttl:
                import io as _sio
                return pd.read_json(_sio.StringIO(payload), orient="split")
    except Exception:
        pass  # treat any backend failure as a miss
    return None
295
+
296
+
297
def _cache_set(key: str, df: pd.DataFrame) -> None:
    """Persist DataFrame. Works for SQLite and Neon. Write failures are non-fatal.

    The frame is serialised with orient="split" / ISO dates so _cache_get and
    _bulk_preload_cache can round-trip it; Postgres uses an upsert
    (ON CONFLICT DO UPDATE), SQLite the equivalent INSERT OR REPLACE.
    """
    try:
        serialised = df.to_json(orient="split", date_format="iso")
        if _USE_POSTGRES:
            conn = _get_pg_conn()
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO nav_cache (key, data, ts)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (key) DO UPDATE
                    SET data = EXCLUDED.data,
                        ts = EXCLUDED.ts
                """, (key, serialised, time.time()))
            conn.commit()
            # Do NOT close β€” thread-local connection is reused
        else:
            # _DB_LOCK serialises writers; SQLite allows only one at a time.
            with _DB_LOCK, sqlite3.connect(_CACHE_DB_PATH) as db:
                db.execute(
                    "INSERT OR REPLACE INTO nav_cache (key, data, ts) VALUES (?, ?, ?)",
                    (key, serialised, time.time()),
                )
                db.commit()
    except Exception:
        pass  # cache write failure is non-fatal
322
+
323
+
324
# Initialise at import time (fast, idempotent).
# Wrapped so an unreachable backend never breaks importing this module;
# individual cache reads/writes are guarded separately anyway.
try:
    _init_cache_db()
except Exception:
    pass
329
+
330
+
331
+ # ── In-process cache (lives for the duration of one run) ─────────────────────
332
+
333
@dataclass
class NavEngineCache:
    """
    Two-level cache:
      L1 β€” in-process dict (zero latency within a run, thread-safe via dict GIL)
      L2 β€” SQLite on disk (persists across runs; TTL-based)
    """
    # scheme_code -> NAV history frame (None marks a known-failed fetch)
    nav_history: dict[str, pd.DataFrame | None] = field(default_factory=dict)
    # benchmark ticker -> index history frame (None marks a known-failed fetch)
    benchmark_history: dict[str, pd.DataFrame | None] = field(default_factory=dict)
    # NOTE(review): lock usage is not visible in this chunk β€” presumably guards
    # compound read-modify-write updates across worker threads; confirm callers.
    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)
343
+
344
+
345
+ def _normalize_benchmark_name(name: str) -> str:
346
+ return " ".join((name or "").lower().replace("-", " ").replace("_", " ").split())
347
+
348
+
349
# Tickers that BENCHMARK_MAP resolves to but which yfinance does not serve
# (delisted/unavailable symbols), mapped to the nearest available proxy so
# _prewarm_benchmarks doesn't fail. Hoisted to module level so the table is
# built once rather than on every call.
_YF_UNAVAILABLE: dict[str, str] = {
    "NIFTY_CPSE.NS": "^NSEI",  # PSU index β†’ broad market
    "NIFTYSMLCAP250.NS": "^CNXSC",  # Smallcap 250 β†’ Smallcap 100
    "NIFTY_IPO.NS": "^NSEI",  # IPO index β†’ no yf equivalent
    "NIFTY200_MOMENTUM_30.NS": "^NSEI",  # momentum factor β†’ broad market
    "NIFTY_HOUSING.NS": "^NSEI",
    "NIFTY_LARGEMIDCAP_250.NS": "^NSEI",
    "NIFTY_INDIA_CONSUMPTION.NS": "^NSEI",
    "NIFTY_HEALTHCARE.NS": "^NSEI",
    "NIFTY100_ESG.NS": "^NSEI",
    "NIFTY100_LOWVOL30.NS": "^NSEI",
    "NIFTY_MNC.NS": "^NSEI",
    "NIFTY_INDIA_MANUFACTURING.NS": "^NSEI",
    "NIFTY500_MULTICAP_50_25_25.NS": "^NSEI",
}


def resolve_benchmark_ticker(benchmark_type: str) -> str:
    """
    Map a benchmark label (e.g. "Nifty 500 TRI") to a Yahoo Finance ticker.

    Resolution order:
      1. Reject corrupt scraper artifacts (Java object toString strings).
      2. Exact lookup of the normalized label in BENCHMARK_MAP.
      3. Substring fallback, longest keys first β€” so e.g. a "nifty 500 ..."
         variant is matched by the "nifty 500" key instead of being shadowed
         by the shorter "nifty 50".
      4. Default to "^NSEI" (Nifty 50).
    The resolved ticker is finally re-mapped through _YF_UNAVAILABLE.
    """
    raw = (benchmark_type or "").strip()
    if raw.startswith("com.") or "@" in raw:
        return "^NSEI"  # fallback for corrupt benchmark strings
    normalized = _normalize_benchmark_name(raw)
    if not normalized:
        return "^NSEI"

    ticker = BENCHMARK_MAP.get(normalized)
    if ticker is None:
        ticker = "^NSEI"
        # Longest-key-first keeps the match deterministic and most specific.
        for key in sorted(BENCHMARK_MAP, key=len, reverse=True):
            if key in normalized:
                ticker = BENCHMARK_MAP[key]
                break

    return _YF_UNAVAILABLE.get(ticker, ticker)
385
+
386
+
387
+ def _safe_float(value: Any) -> float | None:
388
+ if value is None:
389
+ return None
390
+ text = str(value).strip().replace(",", "")
391
+ if text in {"", "-", "β€”", "N/A", "N/A*", "na", "nan", "None"}:
392
+ return None
393
+ try:
394
+ return float(text)
395
+ except ValueError:
396
+ return None
397
+
398
+
399
def _request_json_with_retries(
    url: str, max_retries: int = 3, timeout: int = 20
) -> dict[str, Any] | None:
    """GET *url* and return the parsed JSON body, or None after max_retries.

    Retries now wait with a short exponential backoff (0.5s, 1s, ...) between
    attempts β€” consistent with _download_benchmark β€” instead of immediately
    re-hitting the API, which gives transient mfapi failures a chance to clear.
    Any request/HTTP/JSON error counts as a failed attempt.
    """
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()
            return resp.json()
        except Exception:
            if attempt == max_retries:
                return None
            time.sleep(0.5 * (2 ** (attempt - 1)))  # 0.5s, 1s, 2s, ...
    return None
411
+
412
+
413
# ── Bulk preload cache ────────────────────────────────────────────────────────
# Populated once before parallel workers start. _cache_get checks here first,
# avoiding per-fund Neon round-trips on warm cache runs.
# Producer: _bulk_preload_cache (TTL-filtered rows). Consumer: _cache_get.
_PRELOAD_CACHE: dict[str, "pd.DataFrame"] = {}
417
+
418
+
419
def _bulk_preload_cache(scheme_codes: list[str], benchmark_tickers: list[str]) -> None:
    """
    Load ALL nav + benchmark entries from Neon in 2 SQL queries.
    Call once before ThreadPoolExecutor starts β€” cuts Neon queries from ~766 to 2.
    SQLite is local/fast so skipped.

    Stale rows (past their TTL) are skipped but not deleted; unparseable rows
    are silently dropped. Any backend error falls back to per-query lookups.
    """
    import io as _sio
    # NOTE(review): `global` is not required for in-place mutation of the
    # module dict β€” kept for clarity of intent only.
    global _PRELOAD_CACHE

    if not _USE_POSTGRES:
        return

    nav_keys = [f"nav:{c}" for c in scheme_codes if c]
    bench_keys = [f"bench:{t}" for t in benchmark_tickers if t]
    all_keys = nav_keys + bench_keys
    if not all_keys:
        return

    try:
        conn = _get_pg_conn()
        now = time.time()
        # One placeholder per key: a single IN (...) query per preload.
        placeholders = ",".join(["%s"] * len(all_keys))
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT key, data, ts FROM nav_cache WHERE key IN ({placeholders})",
                all_keys,
            )
            rows_fetched = cur.fetchall()

        loaded_nav = loaded_bench = 0
        for key, data, ts in rows_fetched:
            # TTL depends on entry type: NAV weekly, benchmark daily.
            ttl = _NAV_TTL_SECS if key.startswith("nav:") else _BENCH_TTL_SECS
            if (now - ts) >= ttl:
                continue
            try:
                df = pd.read_json(_sio.StringIO(data), orient="split")
                # Normalise dates β€” JSON round-trip strips tz info
                if "date" in df.columns:
                    df["date"] = pd.to_datetime(df["date"]).dt.tz_localize(None).dt.normalize()
            except Exception:
                continue
            _PRELOAD_CACHE[key] = df
            if key.startswith("nav:"):
                loaded_nav += 1
            else:
                loaded_bench += 1

        print(f"[cache] Bulk preload: {loaded_nav} NAV + {loaded_bench} benchmark entries from Neon")

    except Exception as e:
        print(f"[cache] Bulk preload failed (falling back to per-query): {e}")
470
+
471
+
472
def _prewarm_benchmarks(benchmark_tickers: list[str]) -> None:
    """
    Fetch every unique benchmark ticker once, in parallel, before the per-fund
    workers start. Warm tickers are O(1) hits in _PRELOAD_CACHE; each cold
    ticker is downloaded exactly once via _fetch_benchmark_history (which also
    persists it), so workers never block on a benchmark download.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Dedup while preserving first-seen order, dropping empty tickers.
    unique = [t for t in dict.fromkeys(benchmark_tickers) if t]
    if not unique:
        return

    cold = [t for t in unique if f"bench:{t}" not in _PRELOAD_CACHE]
    warm = len(unique) - len(cold)
    if warm:
        print(f"[bench-prewarm] {warm}/{len(unique)} already in cache")
    if not cold:
        return

    print(f"[bench-prewarm] Downloading {len(cold)} cold benchmark tickers in parallel…")

    ok = failed = 0
    with ThreadPoolExecutor(max_workers=min(len(cold), 20)) as pool:
        by_future = {pool.submit(_fetch_benchmark_history, t): t for t in cold}
        for done in as_completed(by_future):
            if done.result() is not None:
                ok += 1
            else:
                failed += 1
                print(f"  [bench-prewarm] WARN: could not fetch {by_future[done]}")

    print(f"[bench-prewarm] Done: {ok} fetched, {failed} failed, {warm} from cache")
512
+
513
+
514
def _fetch_nav_history(scheme_code: str) -> pd.DataFrame | None:
    """Return the NAV history for *scheme_code*: cache first, then mfapi.

    The returned frame has exactly two columns, "date" (normalised, tz-naive)
    and "nav" (float), sorted ascending by date. None on any fetch/parse failure.
    """
    key = f"nav:{scheme_code}"
    hit = _cache_get(key, _NAV_TTL_SECS)
    if hit is not None:
        return hit

    payload = _request_json_with_retries(f"https://api.mfapi.in/mf/{scheme_code}")
    if not payload or "data" not in payload:
        return None

    try:
        raw = pd.DataFrame(payload["data"])
        if raw.empty or "date" not in raw or "nav" not in raw:
            return None
        # mfapi serves dd-mm-yyyy strings; coerce bad rows to NaT/NaN and drop.
        raw["date"] = (
            pd.to_datetime(raw["date"], dayfirst=True, errors="coerce")
            .dt.tz_localize(None)
            .dt.normalize()
        )
        raw["nav"] = pd.to_numeric(raw["nav"], errors="coerce")
        raw = raw.dropna(subset=["date", "nav"]).sort_values("date")
        if raw.empty:
            return None
        history = raw[["date", "nav"]]
        _cache_set(key, history)
        return history
    except Exception:
        return None
540
+
541
+
542
def _fetch_benchmark_history(ticker: str) -> pd.DataFrame | None:
    """Fetch benchmark history: disk cache (L2) first, then yfinance.

    On a cold fetch the frame is now stored BOTH in the persistent cache and
    in the in-process _PRELOAD_CACHE. _prewarm_benchmarks already documented
    (and relied on) that behaviour, but previously only _cache_set was called,
    so every worker's subsequent _cache_get for a freshly prewarmed ticker
    still paid a Neon round-trip.
    """
    cache_key = f"bench:{ticker}"
    cached = _cache_get(cache_key, _BENCH_TTL_SECS)
    if cached is not None:
        return cached

    df = _download_benchmark(ticker)
    if df is not None:
        _cache_set(cache_key, df)
        _PRELOAD_CACHE[cache_key] = df  # make the prewarm result an O(1) in-process hit
    return df
553
+
554
+
555
def _download_benchmark(ticker: str) -> pd.DataFrame | None:
    """
    Raw yfinance download (no caching logic here).

    Parallel workers hitting yfinance simultaneously can get 401 Invalid Crumb
    errors because yfinance refreshes its session cookie lazily. Fix:
    - Retry up to 4 times with exponential backoff (0.5s, 1s, 2s)
    - Each retry creates a fresh Ticker session, which re-fetches the crumb
    - Suppress noisy 'possibly delisted' stderr from yfinance

    Returns a two-column frame ("date", "benchmark") sorted by date, or None
    when both download paths fail or yield too little history.
    """
    import contextlib, io as _io

    def _suppress_yf_stderr(fn, *args, **kwargs):
        """Run fn suppressing yfinance's noisy stderr warnings."""
        with contextlib.redirect_stderr(_io.StringIO()):
            return fn(*args, **kwargs)

    # Primary path: bulk yf.download (full history from 2000).
    for attempt in range(4):
        if attempt > 0:
            time.sleep(0.5 * (2 ** (attempt - 1)))  # 0.5s, 1s, 2s

        try:
            bench = _suppress_yf_stderr(
                yf.download,
                ticker,
                start="2000-01-01",
                progress=False,
                auto_adjust=False,
                threads=False,
            )
            if bench is None or bench.empty:
                continue
            # Multi-ticker downloads come back with MultiIndex columns; flatten.
            if isinstance(bench.columns, pd.MultiIndex):
                bench.columns = [str(col[0]) for col in bench.columns]
            bench = bench.reset_index()
            price_col = "Adj Close" if "Adj Close" in bench.columns else "Close"
            if price_col not in bench.columns:
                continue
            bench = bench[["Date", price_col]].rename(
                columns={"Date": "date", price_col: "benchmark"}
            )
            bench["date"] = pd.to_datetime(bench["date"], errors="coerce").dt.tz_localize(None).dt.normalize()
            bench["benchmark"] = pd.to_numeric(bench["benchmark"], errors="coerce")
            bench = bench.dropna(subset=["date", "benchmark"]).sort_values("date")
            # 60-row floor: presumably the minimum usable history β€” confirm.
            if len(bench) >= 60:
                return bench
        except Exception:
            continue

    # Secondary fallback: Ticker().history() uses a separate session/crumb path
    for attempt in range(3):
        if attempt > 0:
            time.sleep(0.5 * attempt)
        try:
            hist = _suppress_yf_stderr(
                yf.Ticker(ticker).history,
                period="10y",
                auto_adjust=False,
            )
            if hist is None or hist.empty:
                continue
            hist = hist.reset_index()
            price_col = "Adj Close" if "Adj Close" in hist.columns else "Close"
            if price_col not in hist.columns:
                continue
            hist = hist[["Date", price_col]].rename(
                columns={"Date": "date", price_col: "benchmark"}
            )
            hist["date"] = pd.to_datetime(hist["date"], errors="coerce").dt.tz_localize(None).dt.normalize()
            hist["benchmark"] = pd.to_numeric(hist["benchmark"], errors="coerce")
            hist = hist.dropna(subset=["date", "benchmark"]).sort_values("date")
            if len(hist) >= 60:
                return hist
        except Exception:
            continue

    return None
632
+
633
+
634
def _trailing_3y_window(df: pd.DataFrame) -> pd.DataFrame:
    """Return the slice of *df* covering the trailing TRAILING_YEARS,
    anchored at the frame's latest "date". Empty input passes through;
    an all-NaT date column yields an empty frame with the same columns."""
    if df.empty:
        return df
    end = df["date"].max()
    if pd.isna(end):
        return pd.DataFrame(columns=df.columns)
    start = end - pd.DateOffset(years=TRAILING_YEARS)
    return df.loc[df["date"] >= start].copy()
642
+
643
+
644
def _nav_history_is_stale(nav_df: pd.DataFrame) -> bool:
    """True when the NAV frame is missing/empty/dateless, or its newest
    observation is more than NAV_STALE_DAYS behind today (both normalised
    to tz-naive midnight before comparing)."""
    if nav_df is None or nav_df.empty or "date" not in nav_df.columns:
        return True
    newest = pd.to_datetime(nav_df["date"], errors="coerce").max()
    if pd.isna(newest):
        return True
    newest = pd.Timestamp(newest).tz_localize(None).normalize()
    threshold = (
        pd.Timestamp.now().tz_localize(None).normalize()
        - pd.Timedelta(days=NAV_STALE_DAYS)
    )
    return newest < threshold
653
+
654
+
655
def _compute_nav_only_metrics(
    nav_df: pd.DataFrame,
    needed_metrics: list[str],
    benchmark_reason: str,
) -> tuple[dict[str, float | None], dict[str, str]]:
    """Compute the NAV-only subset of metrics for one fund.

    Benchmark-dependent metrics in *needed_metrics* are not computed here;
    they are recorded in the skip map with *benchmark_reason*. Returns
    (metric name -> value-or-None, metric name -> skip reason).
    Percent-style outputs (std dev, volatility, mean, drawdown) are scaled
    by 100; Sharpe/Sortino are plain ratios.
    """
    # Keep only recognised metric labels; everything is keyed by these names.
    needed = [m for m in needed_metrics if m in OUTPUT_METRICS]
    out = {m: None for m in needed}
    skip: dict[str, str] = {}
    if not needed:
        return out, skip

    # Benchmark-dependent metrics cannot be computed from NAV alone.
    for m in needed:
        if m in BENCHMARK_DEPENDENT_METRICS:
            skip[m] = benchmark_reason

    # Restrict to the trailing 3-year window before computing return stats.
    window = _trailing_3y_window(nav_df[["date", "nav"]].copy())
    if window.empty:
        for m in needed:
            if m in NAV_ONLY_METRICS:
                skip[m] = "less than 3 years of NAV history"
        return out, skip

    returns = window["nav"].pct_change().dropna()
    # Require a minimal sample before annualised stats are meaningful.
    if len(returns) < 30:
        for m in needed:
            if m in NAV_ONLY_METRICS:
                skip[m] = f"fewer than 30 NAV return points ({len(returns)})"
        return out, skip

    # Annualise daily mean and volatility using TRADING_DAYS.
    mean_daily = returns.mean()
    mean_annual = mean_daily * TRADING_DAYS
    vol = returns.std(ddof=1) * np.sqrt(TRADING_DAYS)

    if pd.notna(vol):
        if "Standard Deviation" in out:
            out["Standard Deviation"] = float(vol * 100)
        if "Volatility" in out:
            out["Volatility"] = float(vol * 100)
    if "Mean" in out and pd.notna(mean_annual):
        out["Mean"] = float(mean_annual * 100)

    if "Sharpe Ratio" in out:
        if pd.notna(vol) and vol > 0:
            sharpe = (mean_annual - RF_RATE) / vol
            out["Sharpe Ratio"] = float(sharpe) if pd.notna(sharpe) else None
        if out["Sharpe Ratio"] is None:
            skip["Sharpe Ratio"] = "volatility is zero or NaN (NAV-only fallback)"

    if "Sortino Ratio" in out:
        downside = returns[returns < 0]
        if not downside.empty:
            downside_std = downside.std(ddof=1) * np.sqrt(TRADING_DAYS)
            if pd.notna(downside_std) and downside_std > 0:
                sortino = (mean_annual - RF_RATE) / downside_std
                out["Sortino Ratio"] = float(sortino) if pd.notna(sortino) else None
            # No usable downside deviation: fall back to Sharpe if available.
            elif out.get("Sharpe Ratio") is not None:
                out["Sortino Ratio"] = float(out["Sharpe Ratio"])
        # No negative returns at all: also fall back to Sharpe.
        elif out.get("Sharpe Ratio") is not None:
            out["Sortino Ratio"] = float(out["Sharpe Ratio"])
        if out["Sortino Ratio"] is None:
            skip["Sortino Ratio"] = "no valid downside deviation (NAV-only fallback)"

    if "Maximum Drawdown" in out:
        # Peak-to-trough decline of the cumulative growth curve, in percent.
        cumulative = (1 + returns).cumprod()
        peak = cumulative.cummax()
        drawdown = (cumulative - peak) / peak
        if not drawdown.empty:
            max_drawdown = drawdown.min()
            out["Maximum Drawdown"] = (
                float(max_drawdown * 100) if pd.notna(max_drawdown) else None
            )
        if out["Maximum Drawdown"] is None:
            skip["Maximum Drawdown"] = "unable to compute NAV-only drawdown"

    return out, skip
730
+
731
+
732
def _compute_metrics(
    returns_df: pd.DataFrame,
) -> tuple[dict[str, float | None], dict[str, str]]:
    """Compute all OUTPUT_METRICS from merged daily fund/benchmark returns.

    *returns_df* must carry "fund_return" and "benchmark_return" columns.
    Returns (result, skip_reasons): every metric is present in *result*
    (float or None); *skip_reasons* explains each None.
    """
    skip: dict[str, str] = {}

    if returns_df.empty:
        for k in OUTPUT_METRICS:
            skip[k] = "empty returns dataframe after merge/window"
        return {k: None for k in OUTPUT_METRICS}, skip

    fund = returns_df["fund_return"]
    bench = returns_df["benchmark_return"]
    result: dict[str, float | None] = {k: None for k in OUTPUT_METRICS}

    # Require a minimal sample for stable statistics.
    if len(fund) < 30:
        for k in OUTPUT_METRICS:
            skip[k] = f"fewer than 30 data points ({len(fund)}) after join"
        return result, skip

    # Annualise using TRADING_DAYS periods per year.
    mean_daily = fund.mean()
    bench_mean_daily = bench.mean()
    mean_annual = mean_daily * TRADING_DAYS
    bench_annual = bench_mean_daily * TRADING_DAYS

    vol = fund.std(ddof=1) * np.sqrt(TRADING_DAYS)
    if pd.notna(vol):
        result["Standard Deviation"] = float(vol * 100)
        result["Volatility"] = float(vol * 100)
    result["Mean"] = float(mean_annual * 100) if pd.notna(mean_annual) else None

    # Beta = cov(fund, bench) / var(bench); undefined for flat benchmarks.
    bench_var = bench.var(ddof=1)
    beta = None
    if pd.notna(bench_var) and bench_var and bench_var > 0:
        cov = np.cov(fund, bench)[0, 1]
        beta = cov / bench_var
    result["Beta"] = float(beta) if beta is not None and pd.notna(beta) else None
    if result["Beta"] is None:
        skip["Beta"] = (
            "benchmark variance is zero or NaN"
            if not (pd.notna(bench_var) and bench_var and bench_var > 0)
            else "beta computation returned NaN"
        )

    # CAPM alpha (annualised, in %); requires Beta.
    if beta is not None and pd.notna(mean_annual):
        alpha = mean_annual - (RF_RATE + beta * (bench_annual - RF_RATE))
        result["Alpha"] = float(alpha * 100) if pd.notna(alpha) else None
    if result["Alpha"] is None:
        skip["Alpha"] = (
            "Beta is None β€” Alpha requires Beta"
            if result["Beta"] is None
            else "Alpha computation returned NaN"
        )

    if vol and vol > 0:
        sharpe = (mean_annual - RF_RATE) / vol
        result["Sharpe Ratio"] = float(sharpe) if pd.notna(sharpe) else None
    if result["Sharpe Ratio"] is None:
        skip["Sharpe Ratio"] = "volatility is zero or NaN"

    # Sortino: Sharpe numerator over downside-only deviation; falls back to
    # Sharpe when there is no usable downside deviation.
    downside = fund[fund < 0]
    if not downside.empty:
        downside_std = downside.std(ddof=1) * np.sqrt(TRADING_DAYS)
        if pd.notna(downside_std) and downside_std > 0:
            sortino = (mean_annual - RF_RATE) / downside_std
            result["Sortino Ratio"] = float(sortino) if pd.notna(sortino) else None
        elif result["Sharpe Ratio"] is not None:
            result["Sortino Ratio"] = float(result["Sharpe Ratio"])
        else:
            skip["Sortino Ratio"] = "downside std dev is zero and Sharpe fallback unavailable"
    elif result["Sharpe Ratio"] is not None:
        result["Sortino Ratio"] = float(result["Sharpe Ratio"])
    else:
        skip["Sortino Ratio"] = (
            "no negative daily returns in 3Y window and Sharpe fallback unavailable"
        )

    # Max peak-to-trough loss of the cumulative return series, in %.
    cumulative = (1 + fund).cumprod()
    peak = cumulative.cummax()
    drawdown = (cumulative - peak) / peak
    if not drawdown.empty:
        max_drawdown = drawdown.min()
        result["Maximum Drawdown"] = (
            float(max_drawdown * 100) if pd.notna(max_drawdown) else None
        )

    # R² = squared Pearson correlation of daily returns.
    corr = fund.corr(bench)
    if pd.notna(corr):
        result["R-Squared"] = float(corr ** 2)
    else:
        skip["R-Squared"] = "fund/benchmark correlation is NaN"

    # Information ratio = annualised active return over tracking error.
    active = fund - bench
    tracking_error = active.std(ddof=1) * np.sqrt(TRADING_DAYS)
    if pd.notna(tracking_error) and tracking_error > 0:
        info_ratio = (mean_annual - bench_annual) / tracking_error
        result["Information Ratio"] = (
            float(info_ratio) if pd.notna(info_ratio) else None
        )
    else:
        skip["Information Ratio"] = (
            "tracking error is zero β€” fund mirrors benchmark"
            if (pd.notna(tracking_error) and tracking_error == 0)
            else "tracking error is NaN"
        )

    # Up-market capture: fund mean on benchmark up-days / bench mean, in %.
    up = returns_df[returns_df["benchmark_return"] > 0]
    if not up.empty:
        up_bench = up["benchmark_return"].mean()
        if pd.notna(up_bench) and up_bench != 0:
            up_capture = (up["fund_return"].mean() / up_bench) * 100
            result["Up Market Capture\nRatio"] = (
                float(up_capture) if pd.notna(up_capture) else None
            )
        else:
            skip["Up Market Capture\nRatio"] = "benchmark mean on up-days is zero or NaN"
    else:
        skip["Up Market Capture\nRatio"] = "no benchmark up-days in 3Y window"

    # Down-market capture: same ratio restricted to benchmark down-days.
    down = returns_df[returns_df["benchmark_return"] < 0]
    if not down.empty:
        down_bench = down["benchmark_return"].mean()
        if pd.notna(down_bench) and down_bench != 0:
            down_capture = (down["fund_return"].mean() / down_bench) * 100
            result["Down Market Capture\nRatio"] = (
                float(down_capture) if pd.notna(down_capture) else None
            )
        else:
            skip["Down Market Capture\nRatio"] = "benchmark mean on down-days is zero or NaN"
    else:
        skip["Down Market Capture\nRatio"] = "no benchmark down-days in 3Y window"

    return result, skip
864
+
865
+
866
def compute_nav_metrics_for_scheme(
    *,
    scheme_code: str,
    benchmark_type: str,
    needed_metrics: list[str],
    cache: NavEngineCache,
) -> tuple[dict[str, float | None], dict[str, str]]:
    """
    Compute trailing-3Y risk metrics for a scheme.

    Returns (metrics, skip_reasons): every requested metric appears in
    *metrics* (float or None); *skip_reasons* explains each None.

    Thread-safe: all reads/writes of the shared L1 dicts on *cache* happen
    under cache._lock. Network fetches run unlocked so a slow HTTP call
    does not serialise the whole worker pool; a rare duplicate fetch for
    the same key is possible and harmless (last writer wins).
    """
    needed = [m for m in needed_metrics if m in OUTPUT_METRICS]
    if not needed:
        return {}, {}

    code = str(scheme_code or "").strip()
    if not code:
        reason = "no scheme code β€” category header or unresolved scheme"
        return {m: None for m in needed}, {m: reason for m in needed}

    # ── NAV history (L1 check then L2 fetch) ──────────────────────────────
    # The previous revision pre-seeded a None "sentinel" and then carried a
    # second `elif nav_df is None` branch that re-read the same dict key;
    # the branch was redundant and could trigger a duplicate network fetch.
    # A single locked read followed by fetch-and-store is equivalent and
    # simpler. A failed fetch stores None, so it is retried on a later call.
    with cache._lock:
        nav_df = cache.nav_history.get(code)
    if nav_df is None:
        nav_df = _fetch_nav_history(code)
        with cache._lock:
            cache.nav_history[code] = nav_df

    if nav_df is None or nav_df.empty:
        reason = f"MFAPI returned no NAV history for scheme code {code}"
        return {m: None for m in needed}, {m: reason for m in needed}
    if _nav_history_is_stale(nav_df):
        latest = pd.to_datetime(nav_df["date"], errors="coerce").max()
        latest_str = (
            pd.Timestamp(latest).tz_localize(None).normalize().strftime("%Y-%m-%d")
            if pd.notna(latest) else "unknown"
        )
        reason = f"NAV history is stale for scheme code {code} (latest NAV {latest_str})"
        return {m: None for m in needed}, {m: reason for m in needed}

    # ── Benchmark history (L1 check then L2 fetch) ────────────────────────
    ticker = resolve_benchmark_ticker(benchmark_type)

    def _ensure_benchmark(t: str) -> pd.DataFrame | None:
        # Same locked-read / unlocked-fetch / locked-store pattern as NAV.
        with cache._lock:
            bench = cache.benchmark_history.get(t)
        if bench is None:
            bench = _fetch_benchmark_history(t)
            with cache._lock:
                cache.benchmark_history[t] = bench
        return bench

    bench_df = _ensure_benchmark(ticker)
    # Fall back to NIFTY 50 when the mapped benchmark is missing or too thin.
    if (bench_df is None or bench_df.empty or len(bench_df) < 60) and ticker != "^NSEI":
        bench_df = _ensure_benchmark("^NSEI")
    if bench_df is None or bench_df.empty:
        reason = f"benchmark history unavailable for ticker={ticker} and NIFTY 50 fallback also failed"
        return _compute_nav_only_metrics(nav_df, needed, reason)

    # ── Merge + compute ───────────────────────────────────────────────────
    # Strip tz from both sides β€” yfinance returns UTC-aware, JSON cache is naive
    nav_df = nav_df.copy()
    bench_df = bench_df.copy()
    nav_df["date"] = pd.to_datetime(nav_df["date"]).dt.tz_localize(None).dt.normalize()
    bench_df["date"] = pd.to_datetime(bench_df["date"]).dt.tz_localize(None).dt.normalize()

    # Debt funds (Liquid, Overnight, Ultra Short etc.) publish NAV every calendar
    # day including weekends/holidays, while equity benchmarks only publish on
    # trading days. A naive inner-join on date yields almost no matching rows
    # (<30) causing all metrics to return None.
    # Fix: forward-fill NAV to the benchmark's trading-day calendar so the merge
    # always produces a full 3Y of matched rows regardless of fund type.
    bench_dates = bench_df[["date"]].drop_duplicates().sort_values("date")
    nav_reindexed = (
        nav_df.set_index("date")
        .reindex(bench_dates["date"])
        .ffill()  # carry last known NAV forward
        .dropna()
        .reset_index()
        .rename(columns={"index": "date"})
    )
    merged = pd.merge(nav_reindexed, bench_df, on="date", how="inner")
    if merged.empty:
        reason = f"no overlapping dates between NAV (scheme={code}) and benchmark (ticker={ticker})"
        return _compute_nav_only_metrics(nav_df, needed, reason)

    merged = _trailing_3y_window(merged)
    if merged.empty:
        reason = f"less than 3 years of overlapping data for scheme={code}"
        return {m: None for m in needed}, {m: reason for m in needed}

    merged["fund_return"] = merged["nav"].pct_change()
    merged["benchmark_return"] = merged["benchmark"].pct_change()
    merged = merged.dropna(subset=["fund_return", "benchmark_return"]).copy()
    if merged.empty:
        reason = "all rows dropped after computing benchmark-joined returns"
        return _compute_nav_only_metrics(nav_df, needed, reason)

    all_metrics, all_skip = _compute_metrics(merged)
    metrics = {m: all_metrics.get(m) for m in needed}
    skip_reasons = {
        m: all_skip[m]
        for m in needed
        if m in all_skip and metrics.get(m) is None
    }

    # Defensive top-up: metrics computable from NAV alone should never stay
    # None just because the benchmark join was thin.
    if any(m in NAV_ONLY_METRICS and metrics.get(m) is None for m in needed):
        nav_only, nav_only_skip = _compute_nav_only_metrics(
            nav_df, needed, "benchmark-dependent metric unavailable"
        )
        for m in needed:
            if (
                m in NAV_ONLY_METRICS
                and metrics.get(m) is None
                and nav_only.get(m) is not None
            ):
                metrics[m] = nav_only[m]
                skip_reasons.pop(m, None)
            elif (
                metrics.get(m) is None
                and m not in skip_reasons
                and m in nav_only_skip
            ):
                skip_reasons[m] = nav_only_skip[m]

    return metrics, skip_reasons
src/pdf_generator.py ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Generator: Produces the investor portfolio review PDF.
3
+
4
+ Layout (matching the sample investor-portfolio-review PDF):
5
+ Page 1:
6
+ - Header (Advisor + Client details)
7
+ - Executive Summary (total value, gain, metrics)
8
+ - Holdings table (all schemes with score)
9
+ - Market Cap Allocation pie
10
+ - Sector Allocation bar
11
+
12
+ Page 2+:
13
+ - Per-scheme detail block (fund metrics vs top quartile vs benchmark)
14
+ - Underperforming flags
15
+ - Switch suggestion (if any)
16
+ - Capital gains estimate (if switch suggested)
17
+
18
+ Final Page:
19
+ - Wealth Projection chart
20
+ - Disclaimer
21
+ """
22
+
23
+ import io
24
+ import os
25
+ from pathlib import Path
26
+ from datetime import datetime
27
+ from typing import Optional, List
28
+ from reportlab.lib.pagesizes import A4
29
+ from reportlab.lib import colors
30
+ from reportlab.lib.units import mm
31
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
32
+ from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_RIGHT
33
+ from reportlab.platypus import (
34
+ SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
35
+ HRFlowable, PageBreak, Image, KeepTogether
36
+ )
37
+ from reportlab.platypus.flowables import Flowable
38
+ from reportlab.graphics.shapes import Drawing, Rect, String
39
+ import matplotlib
40
+ matplotlib.use('Agg')
41
+
42
+ from src.models import PortfolioReport, ClientHolding, Fund
43
+ from src import charts as ch
44
+
45
+
46
+ # ─── Theme ───────────────────────────────────────────────────────────────────
47
+
48
# Core palette used across headers, tables and status highlighting.
DARK_BLUE = colors.HexColor("#1F3864")    # primary headings / table headers
MID_BLUE = colors.HexColor("#2E75B6")     # accents, sub-headers
LIGHT_BLUE = colors.HexColor("#BDD7EE")   # highlighted rows (e.g. totals)
GREEN = colors.HexColor("#2ECC71")        # positive / on-track
ORANGE = colors.HexColor("#E67E22")
RED = colors.HexColor("#E74C3C")          # negative / underperforming
GREY_BG = colors.HexColor("#F5F5F5")      # alternating row background
LIGHT_GREY = colors.HexColor("#D9D9D9")   # grid lines
WHITE = colors.white
BLACK = colors.black

# Page geometry: A4 with a uniform 15 mm margin.
W, H = A4
MARGIN = 15 * mm

# Base stylesheet all custom ParagraphStyles derive from (via S()).
styles = getSampleStyleSheet()
63
+
64
def S(name, **kwargs):
    """Shorthand: build a ParagraphStyle derived from the sheet's Normal style."""
    base = styles['Normal']
    return ParagraphStyle(name, parent=base, **kwargs)
67
+
68
+
69
+ # ─── Style Definitions ────────────────────────────────────────────────────────
70
+
71
# Named paragraph styles reused by every section builder below.
STYLE_TITLE = S("Title", fontSize=18, textColor=DARK_BLUE, fontName="Helvetica-Bold",
                spaceAfter=2, alignment=TA_CENTER)           # report main title
STYLE_SUBTITLE = S("Subtitle", fontSize=9, textColor=MID_BLUE, fontName="Helvetica",
                   spaceAfter=4, alignment=TA_CENTER)        # confidentiality line
STYLE_H1 = S("H1", fontSize=11, textColor=DARK_BLUE, fontName="Helvetica-Bold",
             spaceAfter=3, spaceBefore=6)                    # section heading
STYLE_H2 = S("H2", fontSize=9, textColor=DARK_BLUE, fontName="Helvetica-Bold",
             spaceAfter=2, spaceBefore=4)                    # sub-heading
STYLE_BODY = S("Body", fontSize=8, textColor=BLACK,
               spaceAfter=2)                                 # regular body text
STYLE_SMALL = S("Small", fontSize=7, textColor=colors.HexColor("#555555"),
                spaceAfter=1)                                # fine print / captions
STYLE_WARN = S("Warn", fontSize=8, textColor=colors.HexColor("#C0392B"),
               fontName="Helvetica-Bold")                    # red alert text
STYLE_OK = S("OK", fontSize=8, textColor=colors.HexColor("#27AE60"),
             fontName="Helvetica-Bold")                      # green positive text
STYLE_DISCLAIMER = S("Disc", fontSize=6, textColor=colors.HexColor("#666666"),
                     spaceAfter=2, leading=8)                # final-page disclaimer
89
+
90
+
91
+ def _fmt_inr(value: float) -> str:
92
+ """Format as Indian currency string."""
93
+ if value is None:
94
+ return "N/A"
95
+ if abs(value) >= 1e7:
96
+ return f"β‚Ή{value/1e7:.2f} Cr"
97
+ if abs(value) >= 1e5:
98
+ return f"β‚Ή{value/1e5:.2f} L"
99
+ return f"β‚Ή{value:,.0f}"
100
+
101
+
102
+ def _fmt_pct(value: Optional[float], decimals: int = 2) -> str:
103
+ if value is None:
104
+ return "N/A"
105
+ return f"{value:.{decimals}f}%"
106
+
107
+
108
+ def _fmt_float(value: Optional[float], decimals: int = 2) -> str:
109
+ if value is None:
110
+ return "N/A"
111
+ return f"{value:.{decimals}f}"
112
+
113
+
114
def _img_from_buf(buf: io.BytesIO, width_mm: float, height_mm: float) -> Image:
    """Wrap an in-memory image buffer in a ReportLab Image scaled to mm sizes."""
    picture = Image(buf)
    picture.drawWidth = width_mm * mm
    picture.drawHeight = height_mm * mm
    return picture
120
+
121
+
122
def _table_style(header_color=DARK_BLUE, row_alt=GREY_BG):
    """Baseline TableStyle shared by all data tables: a coloured bold header
    row, alternating white/grey body rows, a thin grid, left-aligned first
    column and right-aligned numeric columns."""
    return TableStyle([
        # Header row (row 0)
        ('BACKGROUND', (0, 0), (-1, 0), header_color),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, 0), 7),
        ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
        # Body rows (row 1 onwards)
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [WHITE, row_alt]),
        ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
        ('FONTSIZE', (0, 1), (-1, -1), 7),
        ('ALIGN', (1, 1), (-1, -1), 'RIGHT'),   # numeric columns
        ('ALIGN', (0, 1), (0, -1), 'LEFT'),     # name/label column
        # Grid + padding
        ('GRID', (0, 0), (-1, -1), 0.3, LIGHT_GREY),
        ('TOPPADDING', (0, 0), (-1, -1), 3),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
        ('LEFTPADDING', (0, 0), (-1, -1), 4),
        ('RIGHTPADDING', (0, 0), (-1, -1), 4),
    ])
140
+
141
+
142
+ # ─── Section Builders ────────────────────────────────────────────────────────
143
+
144
def _build_header(report: PortfolioReport) -> List:
    """Build the header section: advisor brand + client info.

    Returns a list of flowables: advisor/date bar, report title and a
    client-details grid (name/age, mobile/email, PAN).
    """
    adv = report.advisor
    cli = report.client
    today = datetime.now().strftime("%d %B %Y")

    elements = []

    # Top bar (advisor on left, date on right)
    header_data = [[
        Paragraph(f"<b>{adv.name}</b><br/>"
                  f"<font size='8' color='#2E75B6'>{adv.location} | {adv.phone} | {adv.email}</font><br/>"
                  f"<font size='7' color='#888888'>{adv.arn} | AMFI Registered Mutual Fund Distributor</font>",
                  S("adv", fontName='Helvetica-Bold', fontSize=10, textColor=DARK_BLUE)),
        Paragraph(f"<para align='right'><font size='8' color='#888888'>"
                  f"Date: {today}</font></para>",
                  STYLE_SMALL),
    ]]
    header_table = Table(header_data, colWidths=[120*mm, 60*mm])
    header_table.setStyle(TableStyle([
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('ALIGN', (1, 0), (1, 0), 'RIGHT'),
    ]))
    elements.append(header_table)
    elements.append(HRFlowable(width="100%", thickness=2, color=MID_BLUE, spaceAfter=4))

    # Report title
    elements.append(Paragraph("Investor Portfolio Review", STYLE_TITLE))
    elements.append(Paragraph("Confidential | Prepared exclusively for the client", STYLE_SUBTITLE))
    elements.append(HRFlowable(width="100%", thickness=0.5, color=LIGHT_GREY, spaceAfter=6))

    # Client info box (label/value pairs in two columns; missing -> "N/A")
    client_info = [
        ["Client Name", cli.name, "Age", str(cli.age or "N/A")],
        ["Mobile", cli.mobile or "N/A", "Email", cli.email or "N/A"],
        ["PAN", cli.pan or "N/A", "", ""],
    ]
    ct = Table(client_info, colWidths=[30*mm, 55*mm, 25*mm, 70*mm])
    ct.setStyle(TableStyle([
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),   # label columns bold
        ('FONTNAME', (2, 0), (2, -1), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('TEXTCOLOR', (0, 0), (0, -1), DARK_BLUE),
        ('TEXTCOLOR', (2, 0), (2, -1), DARK_BLUE),
        ('ROWBACKGROUNDS', (0, 0), (-1, -1), [GREY_BG, WHITE]),
        ('TOPPADDING', (0, 0), (-1, -1), 3),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
        ('LEFTPADDING', (0, 0), (-1, -1), 4),
        ('GRID', (0, 0), (-1, -1), 0.3, LIGHT_GREY),
    ]))
    elements.append(ct)
    elements.append(Spacer(1, 4*mm))
    return elements
197
+
198
+
199
def _build_summary(report: PortfolioReport) -> List:
    """Portfolio snapshot summary cards.

    One header row + one value row: current value, invested, unrealised
    gain (coloured green/red), Sharpe, Alpha, Beta.
    """
    elements = [Paragraph("πŸ“Š Portfolio Snapshot", STYLE_H1)]

    gain = report.unrealized_gain
    gain_color = "#27AE60" if gain >= 0 else "#E74C3C"
    gain_sign = "+" if gain >= 0 else ""   # negative sign comes from the number itself

    summary_data = [
        ["Current Value", "Total Invested", "Unrealised Gain", "Sharpe Ratio", "Alpha", "Beta"],
        [
            _fmt_inr(report.total_current_value),
            _fmt_inr(report.total_invested),
            # Markup is honoured because cells are wrapped in Paragraphs below.
            f"<font color='{gain_color}'>{gain_sign}{_fmt_inr(gain)}</font>",
            _fmt_float(report.sharpe),
            _fmt_pct(report.alpha),
            _fmt_float(report.beta),
        ],
    ]

    def para_cells(row):
        # Wrap each cell in a Paragraph so inline markup (the gain colour) renders.
        return [Paragraph(str(c), S("sc", fontSize=8, fontName='Helvetica-Bold' if i < 1 else 'Helvetica',
                                    alignment=TA_CENTER, textColor=DARK_BLUE))
                for i, c in enumerate(row)]

    tbl = Table(
        [para_cells(summary_data[0]), para_cells(summary_data[1])],
        colWidths=[30*mm] * 6
    )
    tbl.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), DARK_BLUE),
        ('TEXTCOLOR', (0, 0), (-1, 0), WHITE),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('ROWBACKGROUNDS', (0, 1), (-1, -1), [LIGHT_BLUE]),
        ('GRID', (0, 0), (-1, -1), 0.3, LIGHT_GREY),
        ('TOPPADDING', (0, 0), (-1, -1), 5),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 5),
    ]))
    elements.append(tbl)
    elements.append(Spacer(1, 4*mm))
    return elements
242
+
243
+
244
def _build_holdings_table(report: PortfolioReport) -> List:
    """Main holdings table: one row per scheme plus a TOTAL row, followed
    by any exposure alerts.

    Fixes:
    * The TOTAL cell previously held raw "<b>TOTAL</b>" markup inside a
      plain Table cell β€” ReportLab only interprets such markup inside a
      Paragraph, so the tags printed literally. The row-level FONTNAME
      style already bolds the row, so the cell now holds plain text.
    * A fund score of exactly 0 was rendered as "N/A" because of a
      truthiness check; an explicit `is not None` test is used instead.
    """
    elements = [Paragraph("πŸ“‹ Existing Portfolio Holdings", STYLE_H1)]

    rows = [["#", "Scheme Name", "Current Value", "Allocation", "Score", "Status"]]
    for i, h in enumerate(report.holdings, 1):
        has_score = h.fund is not None and h.fund.score is not None
        score = _fmt_float(h.fund.score) if has_score else "N/A"
        status = "⚠️ Underperforms" if h.is_underperforming else "βœ… On Track"
        rows.append([
            str(i),
            h.scheme_name[:45],   # truncate long names to fit the column
            _fmt_inr(h.current_value),
            _fmt_pct(h.allocation_pct),
            score,
            status,
        ])
    # Plain text: the total row is bolded via the FONTNAME style below.
    rows.append(["", "TOTAL", _fmt_inr(report.total_current_value), "100%", "", ""])

    tbl = Table(rows, colWidths=[8*mm, 80*mm, 28*mm, 18*mm, 14*mm, 32*mm])
    style = _table_style()

    # Red for underperformers, green for on-track (in status column)
    for i, h in enumerate(report.holdings, 1):
        style.add('TEXTCOLOR', (5, i), (5, i), RED if h.is_underperforming else GREEN)

    # Bold + highlight total row
    style.add('FONTNAME', (0, len(rows)-1), (-1, len(rows)-1), 'Helvetica-Bold')
    style.add('BACKGROUND', (0, len(rows)-1), (-1, len(rows)-1), LIGHT_BLUE)

    tbl.setStyle(style)
    elements.append(tbl)
    elements.append(Spacer(1, 3*mm))

    # Exposure warnings
    if report.exposure_warnings:
        elements.append(Paragraph("⚠️ Exposure Alerts", STYLE_H2))
        for warn in report.exposure_warnings:
            elements.append(Paragraph(warn, STYLE_WARN))
        elements.append(Spacer(1, 2*mm))

    return elements
288
+
289
+
290
def _build_allocation_charts(report: PortfolioReport) -> List:
    """Market cap + sector charts side by side.

    Renders the fund-wise pie and market-cap pie in one two-column table,
    then (when data is available) a full-width sector bar chart.
    """
    elements = [Paragraph("πŸ“ˆ Portfolio Allocation Analysis", STYLE_H1)]

    # Holdings pie
    holdings_data = {h.scheme_name: h.current_value for h in report.holdings}
    pie_buf = ch.holdings_pie_chart(holdings_data, "Fund-wise Allocation")

    # Market cap pie (use dummy data if not available)
    mc_data = report.market_cap_allocation or {
        "Large Cap": 10, "Mid Cap": 45, "Small Cap": 40, "Others": 5
    }
    mc_buf = ch.market_cap_pie(mc_data)

    chart_table = Table(
        [[_img_from_buf(pie_buf, 85, 70), _img_from_buf(mc_buf, 80, 70)]],
        colWidths=[90*mm, 90*mm]
    )
    chart_table.setStyle(TableStyle([('VALIGN', (0, 0), (-1, -1), 'TOP')]))
    elements.append(chart_table)
    elements.append(Spacer(1, 3*mm))

    # Sector chart (skipped entirely when no sector data is present)
    if report.sector_allocation:
        sec_buf = ch.sector_bar_chart(report.sector_allocation)
        elements.append(Paragraph("🏭 Sector Allocation", STYLE_H2))
        elements.append(_img_from_buf(sec_buf, 170, 65))
        elements.append(Spacer(1, 3*mm))

    return elements
320
+
321
+
322
def _build_scheme_details(report: PortfolioReport) -> List:
    """Per-scheme detailed analysis blocks.

    For each holding with fund data: a status-tagged header, a CAGR table
    (fund vs category vs benchmark, 1Y/3Y/5Y/10Y) beside a comparison
    chart, a risk-metrics row, and β€” when present β€” a switch suggestion
    comparison table.
    """
    elements = [PageBreak(), Paragraph("πŸ” Individual Scheme Analysis", STYLE_H1)]

    for h in report.holdings:
        fund = h.fund
        if not fund:
            # Category headers / unresolved schemes carry no fund data.
            continue

        # Scheme header
        elements.append(Spacer(1, 3*mm))
        status_color = "#E74C3C" if h.is_underperforming else "#27AE60"
        status_text = "Underperforming vs Benchmark" if h.is_underperforming else "Performing Well"

        elements.append(Paragraph(
            f"<b>{h.scheme_name}</b> &nbsp;&nbsp;"
            f"<font color='{status_color}' size='8'>[{status_text}]</font>",
            STYLE_H2
        ))

        # Metrics comparison table: each row is [fund, category, benchmark]
        periods = ["1 Year", "3 Year", "5 Year", "10 Year"]
        cagr_vals = [
            [fund.cagr_1y, fund.cagr_1y_cat, fund.cagr_1y_bm],
            [fund.cagr_3y, fund.cagr_3y_cat, fund.cagr_3y_bm],
            [fund.cagr_5y, fund.cagr_5y_cat, fund.cagr_5y_bm],
            [fund.cagr_10y, fund.cagr_10y_cat, fund.cagr_10y_bm],
        ]

        cagr_header = ["Period", "Fund CAGR", "Category Avg", "Benchmark"]
        cagr_rows = [cagr_header]
        for period, (f_cagr, cat_cagr, bm_cagr) in zip(periods, cagr_vals):
            cagr_rows.append([
                period,
                _fmt_pct(f_cagr),
                _fmt_pct(cat_cagr),
                _fmt_pct(bm_cagr),
            ])

        cagr_tbl = Table(cagr_rows, colWidths=[30*mm, 30*mm, 30*mm, 30*mm])
        cagr_style = _table_style(header_color=MID_BLUE)
        # Colour fund CAGR red if below benchmark
        for row_i, (_, (f_cagr, _, bm_cagr)) in enumerate(zip(periods, cagr_vals), 1):
            if f_cagr is not None and bm_cagr is not None:
                color = RED if f_cagr < bm_cagr else GREEN
                cagr_style.add('TEXTCOLOR', (1, row_i), (1, row_i), color)
        cagr_tbl.setStyle(cagr_style)

        # Risk metrics row
        risk_header = ["Alpha", "Beta", "Sharpe", "Std Dev", "Sortino", "Max DD", "Score"]
        risk_vals = [
            _fmt_pct(fund.alpha), _fmt_float(fund.beta),
            _fmt_float(fund.sharpe), _fmt_pct(fund.std_dev),
            _fmt_float(fund.sortino), _fmt_pct(fund.max_drawdown),
            _fmt_float(fund.score),
        ]
        risk_tbl = Table(
            [risk_header, risk_vals],
            colWidths=[25*mm, 20*mm, 20*mm, 20*mm, 20*mm, 20*mm, 15*mm]
        )
        risk_tbl.setStyle(_table_style(header_color=colors.HexColor("#34495E")))

        # Charts: bar chart for this scheme
        cagr_chart_data = {
            "1Y": {"fund": fund.cagr_1y, "benchmark": fund.cagr_1y_bm, "category": fund.cagr_1y_cat},
            "3Y": {"fund": fund.cagr_3y, "benchmark": fund.cagr_3y_bm, "category": fund.cagr_3y_cat},
            "5Y": {"fund": fund.cagr_5y, "benchmark": fund.cagr_5y_bm, "category": fund.cagr_5y_cat},
            "10Y": {"fund": fund.cagr_10y, "benchmark": fund.cagr_10y_bm, "category": fund.cagr_10y_cat},
        }
        chart_buf = ch.holding_vs_benchmark_chart(fund.name, cagr_chart_data)

        # Table + chart side by side; keep the whole block on one page.
        row_layout = Table(
            [[cagr_tbl, _img_from_buf(chart_buf, 80, 55)]],
            colWidths=[100*mm, 80*mm]
        )
        row_layout.setStyle(TableStyle([('VALIGN', (0, 0), (-1, -1), 'TOP')]))

        block = KeepTogether([
            row_layout,
            Spacer(1, 2*mm),
            risk_tbl,
        ])

        # Switch suggestion section (appended after the metrics block)
        if h.suggested_fund:
            sf = h.suggested_fund
            elements.append(block)
            elements.append(Paragraph(
                f"πŸ’‘ <b>Suggested Switch:</b> {h.scheme_name} β†’ <b>{sf.name}</b>",
                STYLE_H2
            ))
            comp_data = [
                ["Metric", "Current Fund", "Suggested Fund"],
                ["3Y CAGR", _fmt_pct(fund.cagr_3y), _fmt_pct(sf.cagr_3y)],
                ["5Y CAGR", _fmt_pct(fund.cagr_5y), _fmt_pct(sf.cagr_5y)],
                ["Alpha", _fmt_pct(fund.alpha), _fmt_pct(sf.alpha)],
                ["Sharpe", _fmt_float(fund.sharpe), _fmt_float(sf.sharpe)],
                ["TER", _fmt_pct(fund.ter), _fmt_pct(sf.ter)],
                ["Score", _fmt_float(fund.score), _fmt_float(sf.score)],
            ]
            comp_tbl = Table(comp_data, colWidths=[40*mm, 60*mm, 60*mm])
            comp_style = _table_style(header_color=colors.HexColor("#8E44AD"))
            comp_tbl.setStyle(comp_style)
            elements.append(comp_tbl)
        else:
            elements.append(block)

        elements.append(HRFlowable(width="100%", thickness=0.5, color=LIGHT_GREY, spaceAfter=2))

    return elements
432
+
433
+
434
def _build_wealth_projection(report: PortfolioReport) -> List:
    """Wealth projection table and chart.

    Starts a new page; returns only the heading when the report carries no
    projection data.
    """
    elements = [PageBreak(), Paragraph("πŸ’° Wealth Projection @ 12% p.a.", STYLE_H1)]

    proj = report.wealth_projection
    if not proj:
        return elements

    proj_data = [["Time Horizon", "Projected Value", "Approx. Growth"]]
    current = report.total_current_value
    for yr, val in sorted(proj.items()):
        # Guard against division by zero when current value is 0.
        growth = ((val - current) / current * 100) if current else 0
        proj_data.append([f"{yr} Years", _fmt_inr(val), f"+{growth:.1f}%"])

    proj_tbl = Table(proj_data, colWidths=[40*mm, 60*mm, 40*mm])
    proj_tbl.setStyle(_table_style())
    elements.append(proj_tbl)
    elements.append(Spacer(1, 4*mm))

    # Chart
    wc_buf = ch.wealth_projection_chart(proj, current)
    elements.append(_img_from_buf(wc_buf, 160, 70))
    elements.append(Spacer(1, 4*mm))
    return elements
458
+
459
+
460
# Standard compliance disclaimer rendered (in STYLE_DISCLAIMER) on the
# final page of every generated report. Keep wording unchanged — it is
# regulatory boilerplate, not presentation text.
DISCLAIMER_TEXT = (
    "Disclaimer: We have gathered all the data, information, and statistics from sources believed to be "
    "highly reliable and true. All necessary precautions have been taken to avoid any error, lapse or "
    "insufficiency; however, no representations or warranties are made (express or implied) as to the "
    "reliability, accuracy or completeness of such information. We cannot be held liable for any loss "
    "arising directly or indirectly from the use of, or any action taken on, any information appearing herein. "
    "The user is advised to verify the contents of the report independently. It is not an Investment recommendation "
    "or personal financial, Investment or professional advice and should not be treated as such. The Risk Level of "
    "any of the schemes must always be commensurate with the risk profile, Investment objective or financial goals "
    "of the investor concerned. Returns less than 1 year are in absolute (%) and greater than 1 year are compounded "
    "annualised (CAGR %). SIP returns are shown in XIRR (%). Mutual Fund Investments are subject to market risks, "
    "read all scheme related documents carefully. Past performance may or may not be sustained in the future."
)
473
+
474
+
475
+ # ─── Main Generator ──────────────────────────────────────────────────────────
476
+
477
def _build_quartile_section(report: PortfolioReport) -> List:
    """
    Quartile Analysis Grid — based on senior's handwritten sketch.
    Shows BM / Category / Scheme rows × 1Y/3Y/5Y/10Y columns per holding.
    Scheme row is color-coded: Q1(green)/Q2(blue)/Q3(yellow)/Q4(red).

    Only holdings with a matched fund contribute rows; when none match,
    a placeholder paragraph is rendered instead of the grid.
    """
    elements = [Paragraph("📊 Quartile Analysis — Scheme vs Benchmark & Category", STYLE_H1)]
    elements.append(Paragraph(
        "Each scheme is compared against its Benchmark Index and Category Average "
        "across 1Y/3Y/5Y/10Y periods. The Scheme row shows CAGR and is color-coded "
        "by quartile rank (Q1=Top, Q4=Bottom). ✓ = Fund beats Benchmark that period.",
        STYLE_SMALL
    ))
    elements.append(Spacer(1, 2*mm))

    # Flatten each matched holding into the dict shape the chart helper expects.
    grid_data = []
    for h in report.holdings:
        f = h.fund
        if not f:
            continue
        rank = f.rank_in_category or 1
        total = rank * 4  # approximate — will be corrected when fund_universe passed
        grid_data.append({
            "scheme_name": h.scheme_name,
            "rank_in_category": rank,
            "total_in_category": total,
            "cagr_1y": f.cagr_1y, "cagr_1y_bm": f.cagr_1y_bm, "cagr_1y_cat": f.cagr_1y_cat,
            "cagr_3y": f.cagr_3y, "cagr_3y_bm": f.cagr_3y_bm, "cagr_3y_cat": f.cagr_3y_cat,
            "cagr_5y": f.cagr_5y, "cagr_5y_bm": f.cagr_5y_bm, "cagr_5y_cat": f.cagr_5y_cat,
            "cagr_10y": f.cagr_10y,"cagr_10y_bm": f.cagr_10y_bm,"cagr_10y_cat": f.cagr_10y_cat,
        })

    if grid_data:
        grid_buf = ch.quartile_analysis_grid(grid_data)
        # Scale chart height with the number of holdings, clamped to the page.
        n = len(grid_data)
        chart_h = max(75, n * 28)
        elements.append(_img_from_buf(grid_buf, 175, min(chart_h, 210)))
    else:
        elements.append(Paragraph("No matched fund data available.", STYLE_BODY))

    elements.append(Spacer(1, 3*mm))
    return elements
519
+
520
+
521
def generate_pdf(report: PortfolioReport, output_path: str) -> str:
    """
    Generate the complete PDF report.

    Parameters:
        report: fully analysed portfolio (holdings matched, metrics computed).
        output_path: destination file path; parent directories are created.

    Returns: path to the generated PDF.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # A4 page with the same margin on all four sides.
    doc = SimpleDocTemplate(
        str(output_path),
        pagesize=A4,
        leftMargin=MARGIN,
        rightMargin=MARGIN,
        topMargin=MARGIN,
        bottomMargin=MARGIN,
    )

    story = []

    # ── Page 1 ──────────────────────────────────────────────────────────
    story += _build_header(report)
    story += _build_summary(report)
    story += _build_holdings_table(report)
    story += _build_quartile_section(report)
    story += _build_allocation_charts(report)

    # ── Per-scheme details ───────────────────────────────────────────────
    story += _build_scheme_details(report)

    # ── Wealth projection ────────────────────────────────────────────────
    story += _build_wealth_projection(report)

    # ── Disclaimer ───────────────────────────────────────────────────────
    story.append(HRFlowable(width="100%", thickness=0.5, color=LIGHT_GREY))
    story.append(Spacer(1, 3*mm))
    story.append(Paragraph("Disclaimer", STYLE_H2))
    story.append(Paragraph(DISCLAIMER_TEXT, STYLE_DISCLAIMER))

    doc.build(story)
    return str(output_path)
src/portfolio_engine.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Portfolio Engine: -p mode
3
+
4
+ Loads a client CSV, matches holdings to the fund universe,
5
+ computes portfolio metrics, exposure checks, and wealth projection.
6
+ """
7
+
8
+ import csv
9
+ import numpy as np
10
+ from pathlib import Path
11
+ from typing import List, Optional, Dict
12
+ from src.models import Fund, Client, ClientHolding, Advisor, PortfolioReport
13
+
14
+
15
+ # ─── Client CSV Loader ───────────────────────────────────────────────────────
16
+
17
def load_client_csv(csv_path: str) -> tuple[Client, List[ClientHolding]]:
    """
    Load client data from CSV.

    Expected CSV format:
        Line 1: Name, Age, Email, Mobile[, PAN]
        Line 2+: Scheme Name, Current Value, Invested Amount, SIP Amount, SIP Frequency

    Example:
        Parthiban,45,parthiban@gmail.com,9876543210,ABCDE1234F
        Nippon India Small Cap Fund,280923,200000,5000,Monthly
        HDFC Mid Cap Fund,134562,120000,3000,Monthly

    Raises:
        FileNotFoundError: if csv_path does not exist.
        ValueError: if the file contains no non-blank rows.
    """
    path = Path(csv_path)
    if not path.exists():
        raise FileNotFoundError(f"Client CSV not found: {path}")

    # utf-8-sig transparently strips a BOM; errors='replace' tolerates bad bytes.
    with open(path, encoding='utf-8-sig', errors='replace') as f:
        reader = csv.reader(f)
        rows = [r for r in reader if any(c.strip() for c in r)]

    if not rows:
        raise ValueError("Client CSV is empty")

    def safe_float(v):
        """Parse '1,23,456.78'-style numbers; None when unparseable."""
        try:
            return float(str(v).replace(',', '').strip())
        except (ValueError, TypeError):
            return None

    # Parse client info from first row; missing trailing fields default to None.
    info = rows[0]
    client = Client(
        name=info[0].strip() if len(info) > 0 else "Unknown",
        age=int(info[1]) if len(info) > 1 and info[1].strip().isdigit() else None,
        email=info[2].strip() if len(info) > 2 else None,
        mobile=info[3].strip() if len(info) > 3 else None,
        pan=info[4].strip() if len(info) > 4 else None,
    )

    # Parse holdings from remaining rows.
    holdings: List[ClientHolding] = []
    for row in rows[1:]:
        if not row or not row[0].strip():
            continue
        # Skip header-like rows.
        if row[0].strip().lower() in ('scheme name', 'fund', 'scheme'):
            continue

        holding = ClientHolding(
            scheme_name=row[0].strip(),
            # Guard short rows so a name-only line cannot raise IndexError.
            current_value=(safe_float(row[1]) if len(row) > 1 else None) or 0.0,
            invested_amount=safe_float(row[2]) if len(row) > 2 else None,
            sip_amount=safe_float(row[3]) if len(row) > 3 else None,
            sip_frequency=row[4].strip() if len(row) > 4 else None,
        )
        holdings.append(holding)

    return client, holdings
76
+
77
+
78
+ # ─── Fund Matcher ────────────────────────────────────────────────────────────
79
+
80
def match_holdings_to_funds(holdings: List[ClientHolding], funds: List[Fund]) -> List[ClientHolding]:
    """
    Fuzzy-match each client holding to a fund in the universe.
    Uses Jaccard similarity on lowercased, stopword-free name tokens.

    Side effect: sets holding.fund on each holding whose best Jaccard score
    exceeds the minimum threshold; unmatched holdings are left untouched.
    """
    # Built once here (not inside tokenize) so the set isn't recreated for
    # every fund and holding name.
    stopwords = {'fund', 'regular', 'plan', 'growth', 'option', 'direct',
                 'idcw', 'div', 'dividend', '-', 'the', 'india', 'of'}

    def tokenize(name: str) -> set:
        return set(name.lower().replace('-', ' ').split()) - stopwords

    # Pre-tokenize the universe once; reused for every holding.
    fund_tokens = [(f, tokenize(f.name)) for f in funds]

    for holding in holdings:
        h_tokens = tokenize(holding.scheme_name)
        if not h_tokens:
            continue

        best_fund = None
        best_score = 0.0

        for fund, f_tokens in fund_tokens:
            if not f_tokens:
                continue
            intersection = len(h_tokens & f_tokens)
            union = len(h_tokens | f_tokens)
            jaccard = intersection / union if union else 0

            if jaccard > best_score:
                best_score = jaccard
                best_fund = fund

        if best_score > 0.15:  # minimum match threshold
            holding.fund = best_fund

    return holdings
116
+
117
+
118
+ # ─── Portfolio Analysis ──────────────────────────────────────────────────────
119
+
120
def compute_allocation(holdings: List[ClientHolding]) -> List[ClientHolding]:
    """Compute each holding's % allocation of total portfolio.

    Mutates holding.allocation_pct in place; a zero-value portfolio is
    left untouched (no division by zero).
    """
    portfolio_total = sum(item.current_value for item in holdings)
    if portfolio_total != 0:
        for item in holdings:
            item.allocation_pct = round((item.current_value / portfolio_total) * 100, 2)
    return holdings
128
+
129
+
130
def check_exposure(holdings: List[ClientHolding]) -> tuple[Dict, Dict, List[str]]:
    """
    Check AMC and scheme-level exposure against a 20% concentration threshold.
    Returns (amc_exposure, scheme_exposure, warnings).
    """
    if sum(h.current_value for h in holdings) == 0:
        return {}, {}, []

    THRESHOLD = 20.0
    amc_exposure: Dict[str, float] = {}
    scheme_exposure: Dict[str, float] = {}

    for h in holdings:
        scheme_exposure[h.scheme_name] = h.allocation_pct
        # AMC name heuristic: everything before the first "-" in the scheme name.
        amc_name = h.scheme_name.split('-')[0].strip()
        amc_exposure[amc_name] = amc_exposure.get(amc_name, 0) + h.allocation_pct

    warnings: List[str] = [
        f"⚠️ AMC Exposure Alert: {amc} = {pct:.1f}% (>{THRESHOLD}% threshold)"
        for amc, pct in amc_exposure.items()
        if pct > THRESHOLD
    ]
    warnings += [
        f"⚠️ Scheme Exposure Alert: {scheme} = {pct:.1f}% (>{THRESHOLD}% threshold)"
        for scheme, pct in scheme_exposure.items()
        if pct > THRESHOLD
    ]

    return amc_exposure, scheme_exposure, warnings
162
+
163
+
164
def compute_portfolio_metrics(holdings: List[ClientHolding]) -> Dict:
    """
    Compute portfolio-level weighted average risk metrics.

    Each matched fund contributes its sharpe/alpha/beta/std_dev weighted by
    its share of total current value; missing metrics simply contribute 0.
    Returns {} for a zero-value portfolio.
    """
    total_value = sum(h.current_value for h in holdings)
    if total_value == 0:
        return {}

    accum = {"sharpe": 0.0, "alpha": 0.0, "beta": 0.0, "std_dev": 0.0}

    for h in holdings:
        fund = h.fund
        if not fund:
            continue
        weight = h.current_value / total_value
        for metric_name in accum:
            value = getattr(fund, metric_name, None)
            if value is not None:
                accum[metric_name] += weight * value

    return {name: round(total, 4) for name, total in accum.items()}
187
+
188
+
189
def flag_underperformers(holdings: List[ClientHolding]) -> List[ClientHolding]:
    """
    Flag a holding as underperforming if its fund's CAGR fails to outperform
    EITHER the BM Index OR the Category Average across multiple time periods.

    Rule (from senior advisor's framework):
        A fund's CAGR should:
        1. Outperform the BM Index across time periods (1Y, 3Y, 5Y)
        2. Outperform the category average across time periods
        3. Have superior risk metrics (handled separately via score)

    A fund is flagged if it underperforms on 2+ out of 3 periods
    on EITHER benchmark OR category average.

    Side effect: sets holding.is_underperforming = True on flagged rows.
    """
    # (fund CAGR attr, benchmark CAGR attr, category-average CAGR attr)
    # per period: 1Y, 3Y, 5Y.
    PERIODS = [
        ("cagr_1y", "cagr_1y_bm", "cagr_1y_cat"),
        ("cagr_3y", "cagr_3y_bm", "cagr_3y_cat"),
        ("cagr_5y", "cagr_5y_bm", "cagr_5y_cat"),
    ]

    for h in holdings:
        f = h.fund
        if not f:
            continue

        bm_fails = 0
        cat_fails = 0
        checked = 0  # periods where the fund itself has a CAGR figure

        for cagr_attr, bm_attr, cat_attr in PERIODS:
            fund_cagr = getattr(f, cagr_attr, None)
            if fund_cagr is None:
                continue  # no fund figure for this period — skip it entirely
            checked += 1

            bm_cagr = getattr(f, bm_attr, None)
            if bm_cagr is not None and fund_cagr < bm_cagr:
                bm_fails += 1

            cat_cagr = getattr(f, cat_attr, None)
            if cat_cagr is not None and fund_cagr < cat_cagr:
                cat_fails += 1

        # Flag if underperforms BM on 2+ periods OR underperforms category on 2+ periods
        if checked > 0 and (bm_fails >= 2 or cat_fails >= 2):
            h.is_underperforming = True

    return holdings
236
+
237
+
238
def compute_wealth_projection(total_value: float, years_list: Optional[list] = None,
                              rate: float = 0.12) -> Dict:
    """Project portfolio value at a fixed annual return rate.

    Args:
        total_value: current portfolio value.
        years_list: horizons in years; defaults to [5, 10, 15, 20].
        rate: annual compounding rate (0.12 = 12% p.a.).

    Returns:
        {years: projected value rounded to 2 decimals} for each horizon.
    """
    # None sentinel instead of a mutable default list shared across calls.
    if years_list is None:
        years_list = [5, 10, 15, 20]
    return {
        yr: round(total_value * ((1 + rate) ** yr), 2)
        for yr in years_list
    }
245
+
246
+
247
+ # ─── Main entry ──────────────────────────────────────────────────────────────
248
+
249
def run_portfolio_engine(
    client_csv: str,
    fund_universe: List[Fund],
    advisor: Optional[Advisor] = None,
) -> PortfolioReport:
    """
    Full pipeline: load client → match funds → analyse → build report object.

    Args:
        client_csv: path to the client CSV (see load_client_csv for format).
        fund_universe: funds to fuzzy-match holdings against.
        advisor: report branding; a default Advisor() is used when omitted.

    Returns:
        A populated PortfolioReport; progress and exposure warnings are
        printed to stdout along the way.
    """
    if advisor is None:
        advisor = Advisor()

    print(f"📂 Loading client data from: {client_csv}")
    client, holdings = load_client_csv(client_csv)
    print(f"   Client: {client.name} | Holdings: {len(holdings)}")

    print("🔗 Matching holdings to fund universe...")
    holdings = match_holdings_to_funds(holdings, fund_universe)
    matched = sum(1 for h in holdings if h.fund is not None)
    print(f"   Matched {matched}/{len(holdings)} holdings")

    # Analysis steps mutate the holdings in place (allocation %, flags).
    holdings = compute_allocation(holdings)
    amc_exp, scheme_exp, warnings = check_exposure(holdings)
    holdings = flag_underperformers(holdings)
    metrics = compute_portfolio_metrics(holdings)

    total_current = sum(h.current_value for h in holdings)
    total_invested = sum(h.invested_amount or 0 for h in holdings)

    # Default projection: 12% p.a. over 5/10/15/20 years.
    wealth_projection = compute_wealth_projection(total_current)

    report = PortfolioReport(
        client=client,
        advisor=advisor,
        holdings=holdings,
        total_current_value=total_current,
        total_invested=total_invested,
        unrealized_gain=total_current - total_invested,
        sharpe=metrics.get("sharpe"),
        alpha=metrics.get("alpha"),
        beta=metrics.get("beta"),
        std_dev=metrics.get("std_dev"),
        amc_exposure=amc_exp,
        scheme_exposure=scheme_exp,
        exposure_warnings=warnings,
        wealth_projection=wealth_projection,
    )

    if warnings:
        print("\n".join(warnings))

    return report
src/reference_data.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reference data extractor from Processed_data.xlsx
3
+
4
+ This module extracts BM Index, Category Average, and fund weightage data that the advisor
5
+ has manually filled in Processed_data.xlsx, so we can use it when processing
6
+ raw CSV files that have blank BM/Category rows.
7
+ """
8
+
9
+ import openpyxl
10
+ from typing import Dict, Any, Optional, Tuple
11
+ from pathlib import Path
12
+
13
+
14
def extract_reference_data(processed_xlsx_path: str) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]], Dict[str, int]]:
    """
    Extract BM Index, Category Average, and fund weightage data from Processed_data.xlsx.

    Returns:
        (bm_data, cat_avg_data, fund_weightages) where:
        - bm_data: dict mapping category name to CAGR values for BM Index
        - cat_avg_data: dict mapping category name to CAGR/values for Category Average
        - fund_weightages: dict mapping fund name to manually adjusted weightage value

    A missing file is non-fatal: a warning is printed and three empty dicts
    are returned so callers can proceed without reference data.
    """
    xlsx_path = Path(processed_xlsx_path)
    if not xlsx_path.exists():
        print(f"Warning: Reference file not found: {processed_xlsx_path}")
        return {}, {}, {}

    wb = openpyxl.load_workbook(str(xlsx_path))
    ws = wb.active  # all reference data is read from the active sheet

    bm_data = {}
    cat_avg_data = {}
    fund_weightages = {}
    current_category = None  # most recent category header seen while scanning down

    # Find the Weightage column index by scanning the header row
    weightage_col_idx = None
    for col_idx in range(1, ws.max_column + 1):
        header_val = ws.cell(1, col_idx).value
        if header_val and 'Weightage' in str(header_val):
            weightage_col_idx = col_idx
            break

    # Single top-to-bottom pass; each row is classified by its column-1 value.
    for i in range(1, ws.max_row + 1):
        cell_val = ws.cell(i, 1).value

        # Check if it's a category header (contains ':' and an asset-class keyword)
        if cell_val and ':' in str(cell_val) and any(x in str(cell_val) for x in ['Equity', 'Debt', 'Hybrid', 'Solution', 'Other']):
            current_category = cell_val

        # Check if it's BM Index row — columns 6-9 hold the 1Y/3Y/5Y/10Y values
        elif cell_val == 'BM Index' and current_category:
            bm_1y = ws.cell(i, 6).value
            bm_3y = ws.cell(i, 7).value
            bm_5y = ws.cell(i, 8).value
            bm_10y = ws.cell(i, 9).value

            # Only store if at least one value is present
            if any([bm_1y, bm_3y, bm_5y, bm_10y]):
                bm_data[current_category] = {
                    'cagr_1y': bm_1y,
                    'cagr_3y': bm_3y,
                    'cagr_5y': bm_5y,
                    'cagr_10y': bm_10y
                }

        # Check if it's Category Average row — adds P/E (col 12) and P/B (col 13)
        elif cell_val == 'Category Average' and current_category:
            cat_1y = ws.cell(i, 6).value
            cat_3y = ws.cell(i, 7).value
            cat_5y = ws.cell(i, 8).value
            cat_10y = ws.cell(i, 9).value
            pe = ws.cell(i, 12).value
            pb = ws.cell(i, 13).value

            # Only store if at least one CAGR value is present
            if any([cat_1y, cat_3y, cat_5y, cat_10y]):
                cat_avg_data[current_category] = {
                    'cagr_1y': cat_1y,
                    'cagr_3y': cat_3y,
                    'cagr_5y': cat_5y,
                    'cagr_10y': cat_10y,
                    'pe_ratio': pe,
                    'pb_ratio': pb
                }

        # Check if it's a fund row (not category header, BM Index, or Category Average)
        elif cell_val and cell_val not in ['BM Index', 'Category Average', 'Fund'] and current_category:
            # Extract fund name
            fund_name = str(cell_val).strip()

            # Extract weightage if we found the Weightage column
            if weightage_col_idx:
                weightage_val = ws.cell(i, weightage_col_idx).value
                if weightage_val is not None:
                    try:
                        # Convert to int if possible, otherwise round float to nearest int
                        if isinstance(weightage_val, float):
                            fund_weightages[fund_name] = int(round(weightage_val))
                        else:
                            fund_weightages[fund_name] = int(weightage_val)
                    except (ValueError, TypeError):
                        # If conversion fails, skip this fund
                        pass

    wb.close()

    print(f"Loaded reference data: {len(bm_data)} categories with BM Index, {len(cat_avg_data)} with Category Average, {len(fund_weightages)} fund weightages")

    return bm_data, cat_avg_data, fund_weightages
112
+
113
+
114
def get_fund_weightage_from_reference(fund_name: str, fund_weightages: Dict[str, int]) -> Optional[int]:
    """
    Get the manually adjusted weightage for a fund from reference data.

    Args:
        fund_name: Name of the fund
        fund_weightages: Dictionary of fund name to weightage from Processed_data.xlsx

    Returns:
        Weightage value if found, None otherwise
    """
    # An exact name hit wins outright.
    try:
        return fund_weightages[fund_name]
    except KeyError:
        pass

    # Fall back to a case-insensitive substring match in either direction,
    # tolerating slight naming differences between sources.
    needle = fund_name.lower()
    for reference_name, weightage in fund_weightages.items():
        haystack = reference_name.lower()
        if needle in haystack or haystack in needle:
            return weightage

    return None
135
+
136
+
137
# Default reference file path (was accidentally defined twice; kept once).
DEFAULT_REFERENCE_PATH = "PS/Processed data.xlsx"
src/scheme_resolver.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scheme Code Resolver
2
+ ======================
3
+ Resolves missing AMFI scheme codes by fuzzy-matching the fund name from the
4
+ CSV against mfapi.in's /mf/search endpoint.
5
+
6
+ This runs as a PRE-TRIAGE step so that the NAV engine can fire for funds whose
7
+ scheme code was absent from the CSV.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import difflib
13
+ import re
14
+ import time
15
+
16
+ import requests
17
+
18
+
19
+ MFAPI_SEARCH = "https://api.mfapi.in/mf/search"
20
+ MATCH_CUTOFF = 0.52 # minimum SequenceMatcher ratio to accept
21
+ SLEEP_BETWEEN = 0.25 # seconds between API calls (polite rate limit)
22
+
23
+ # Manual overrides for schemes that mfapi's search endpoint does not
24
+ # currently return, but whose AMFI codes are known and stable. Keys are
25
+ # normalized fund names (see _normalize).
26
+ SCHEME_OVERRIDES: dict[str, str] = {
27
+ # ── Pre-verified from AMFI NAV master (portal.amfiindia.com) ──────────────
28
+ # These funds have empty scheme codes in source CSV and cannot be reliably
29
+ # resolved via mfapi fuzzy search. Codes are Regular Plan - Growth only.
30
+
31
+ # Existing override
32
+ "kotak tax saver scheme growth": "109234",
33
+
34
+ # ── Debt: Banking and PSU ─────────────────────────────────────────────────
35
+ "hdfc banking and psu debt fund growth option": "128628",
36
+ "icici prudential banking and psu debt fund growth": "112342",
37
+ "kotak banking and psu debt growth": "123690",
38
+ "invesco india banking and psu fund growth option": "118232",
39
+ "sundaram banking psu fund formerly known as sundaram banking and psu debt fund regular plan growth": "100784",
40
+ "hsbc banking and psu debt fund regular growth": "151104",
41
+ "iti banking psu debt fund regular plan growth option": "148535",
42
+
43
+ # ── Debt: Liquid ──────────────────────────────────────────────────────────
44
+ "dsp liquidity fund regular plan growth": "119120",
45
+ "invesco india liquid fund growth": "104488",
46
+ "invesco india liquid fund regular growth": "118769",
47
+ "union liquid fund growth option": "115398",
48
+ "parag parikh liquid fund regular plan growth": "149038",
49
+ "motilal oswal liquid fund regular growth": "147622",
50
+ "iti liquid fund regular plan growth option": "147153",
51
+ "quantum liquid fund regular plan growth option": "103504",
52
+ "lic mf liquid fund regular plan growth": "120716",
53
+ "icici prudential liquid fund growth": "120593",
54
+ "aditya birla sun life liquid fund retail growth": "100042",
55
+ "aditya birla sun life liquid fund growth": "100047",
56
+ "edelweiss liquid fund regular plan growth option": "140182",
57
+ "edelweiss liquid fund retail plan growth option": "119114",
58
+ "axis liquid fund retail plan growth option": "112090",
59
+ "sbi liquid fund regular plan growth": "119822",
60
+ "nippon india liquid fund retail option growth plan": "100837",
61
+
62
+ # ── Debt: Overnight ───────────────────────────────────────────────────────
63
+ "uti overnight fund regular plan growth option": "100814",
64
+ "canara robeco overnight fund regular plan growth option": "147534",
65
+ "dsp overnight fund regular plan growth": "146061",
66
+ "franklin india overnight fund growth": "146210",
67
+ "bandhan overnight fund regular plan growth": "146187",
68
+ "iti overnight fund regular plan growth option": "148529",
69
+ "union overnight fund regular plan growth option": "146997",
70
+ "icici prudential overnight fund growth": "145811",
71
+ "edelweiss overnight fund regular plan growth": "147569",
72
+ "lic mf overnight fund regular plan growth": "146065",
73
+ "hdfc overnight fund growth option": "145822",
74
+
75
+ # ── Debt: Ultra Short Duration ────────────────────────────────────────────
76
+ "icici prudential ultra short term fund growth": "120505",
77
+ "invesco india ultra short duration fund growth": "117825",
78
+ "uti ultra short duration fund regular plan growth option": "102532",
79
+ "aditya birla sun life savings fund growth regular plan": "119293",
80
+ "aditya birla sun life savings fund retail growth": "119293",
81
+ "hdfc ultra short term fund growth option": "145539",
82
+ "aditya birla sun life savings fund discipline advantage plan": "112016",
83
+ "pgim india ultra short duration fund growth": "100474",
84
+ "iti ultra short duration fund regular plan growth option": "148533",
85
+ "motilal oswal ultra short term fund mofustf regular plan growth": "124233",
86
+ "tata ultra short term fund regular plan growth": "146070",
87
+ "kotak savings fund growth": "119270",
88
+ "lic mf ultra short duration fund regular plan growth": "147770",
89
+ "canara robeco ultra short term fund regular plan growth option": "119671",
90
+ "sundaram ultra short duration fund formerly known as principal ultra short term fund growth option": "120826",
91
+ "bank of india ultra short duration fund regular plan growth": "109269",
92
+
93
+ # ── Debt: Short Duration ──────────────────────────────────────────────────
94
+ "hdfc short term debt fund growth option": "119247",
95
+ "icici prudential short term fund growth option": "101758",
96
+ "sbi short horizon debt fund short term fund retail growth": "106227",
97
+ "sbi short term debt fund regular plan growth": "119831",
98
+ "kotak bond short term plan growth": "101373",
99
+ "dsp short term fund regular plan growth": "119598",
100
+ "lic mf short duration fund regular plan growth": "145952",
101
+ "mirae asset short duration fund regular plan growth": "148416",
102
+ "invesco india short duration fund growth": "105185",
103
+ "canara robeco short duration fund regular plan growth option": "119675",
104
+ "groww short duration fund formerly known as indiabulls short term fund regular plan growth option": "123708",
105
+ "tata short term bond fund regular plan growth option": "119802",
106
+
107
+ # ── Debt: Medium Duration ─────────────────────────────────────────────────
108
+ "aditya birla sun life medium term plan growth regular plan": "111803",
109
+ "axis strategic bond fund regular plan growth option": "116894",
110
+ "icici prudential medium term bond fund growth": "120841",
111
+ "hdfc medium term debt fund growth option": "119238",
112
+ "kotak medium term fund growth": "119281",
113
+ "dsp bond fund growth": "100078",
114
+ "sundaram medium duration fund formerly known as sundaram medium term bond fund regular plan growth": "100603",
115
+
116
+ # ── ETFs ──────────────────────────────────────────────────────────────────
117
+ "hdfc nifty100 low volatility 30 etf growth option": "145748",
118
+ "hdfc nifty200 momentum 30 etf growth option": "146058",
119
+ "hdfc nifty it etf growth option": "120493",
120
+ "hdfc nifty private bank etf growth option": "145696",
121
+
122
+ # ── Index Funds ───────────────────────────────────────────────────────────
123
+ "dsp nifty next 50 index fund regular plan growth": "143669",
124
+ "uti nifty next 50 index fund regular plan growth option": "120713",
125
+ "motilal oswal nifty smallcap 250 index regular plan": "147960",
126
+ "icici prudential nifty pharma index fund growth": "143874",
127
+ "dsp nifty 50 index fund regular plan growth": "143537",
128
+ "motilal oswal nifty midcap 150 index fund regular plan": "147068",
129
+ "sbi nifty index fund regular plan growth": "135818",
130
+ "motilal oswal nifty bank index regular plan": "145552",
131
+ }
132
+
133
+
134
+ def _normalize(name: str) -> str:
135
+ """Convert hyphenated CSV name to a clean lowercase string."""
136
+ return re.sub(r"[-_]+", " ", name).strip().lower()
137
+
138
+
139
+ def _search_query(name: str) -> str:
140
+ """Take first 6 tokens for a focused search query."""
141
+ return " ".join(_normalize(name).split()[:6])
142
+
143
+
144
def _search_mfapi(query: str) -> list[dict]:
    """Hit mfapi's /mf/search endpoint; any failure is logged and yields []."""
    try:
        response = requests.get(MFAPI_SEARCH, params={"q": query}, timeout=15)
        response.raise_for_status()
        return response.json()
    except Exception as exc:
        print(f" [resolver] search error for '{query}': {exc}")
        return []
152
+
153
+
154
def _best_match(candidates: list[dict], target_name: str) -> dict | None:
    """Return the candidate most similar to target_name, or None.

    Similarity is difflib's SequenceMatcher ratio over normalized names;
    candidates below MATCH_CUTOFF are rejected.
    """
    if not candidates:
        return None

    target = _normalize(target_name)

    def similarity(item: dict) -> float:
        candidate_name = _normalize(item.get("schemeName", ""))
        return difflib.SequenceMatcher(None, target, candidate_name).ratio()

    winner = max(candidates, key=similarity)
    if similarity(winner) >= MATCH_CUTOFF:
        return winner
    return None
169
+
170
+
171
+ def _is_valid_scheme_code(code: str) -> bool:
172
+ """AMFI scheme codes are purely numeric (e.g. 120586). Platform codes like GROWWEH are invalid."""
173
+ return bool(code and code.isdigit())
174
+
175
+
176
def resolve_scheme_code_for_fund_name(
    fund_name: str,
) -> tuple[str | None, str | None]:
    """
    Resolve a scheme code for one fund name.

    Resolution order:
    1. Exact normalized-name override from SCHEME_OVERRIDES
    2. mfapi search + fuzzy best-match

    Returns (code, matched_name) — matched_name is the literal "override"
    when the override table supplied the code; (None, None) on no match.
    """
    known_code = SCHEME_OVERRIDES.get(_normalize(fund_name))
    if known_code:
        return known_code, "override"

    hits = _search_mfapi(_search_query(fund_name))
    best = _best_match(hits, fund_name)
    if not best:
        return None, None
    return str(best["schemeCode"]), best.get("schemeName", "")
197
+
198
+
199
def resolve_missing_scheme_codes(
    rows: list[dict[str, str]],
    *,
    verbose: bool = True,
) -> tuple[list[dict[str, str]], dict[str, str]]:
    """
    Resolve blank scheme codes and also correct any exact-name rows whose
    current numeric code disagrees with SCHEME_OVERRIDES.

    Blank/invalid codes are resolved via SCHEME_OVERRIDES (O(1) dict lookup)
    first, then mfapi search in parallel.

    Args:
        rows: CSV rows; reads "Fund" and reads/writes "Scheme Code" in place.
        verbose: print per-fund progress plus a summary line.

    Returns:
        (rows, resolved) where resolved maps fund name -> code for every row
        that was filled in or corrected.

    Complexity: O(N) time, O(N) space where N = funds with missing codes.
    Network I/O parallelised with ThreadPoolExecutor(20) — pure I/O bound.
    """
    # Proper imports instead of the previous __import__("threading") hack.
    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    resolved: dict[str, str] = {}
    corrected_existing = 0

    # ── Collect rows that need resolution ─────────────────────────────────────
    target_rows: list[dict[str, str]] = []
    for row in rows:
        fund_name = (row.get("Fund") or "").strip()
        # Skip blanks, names without the "AMC - Scheme - Plan" shape, and
        # category headers (which contain ':').
        if not fund_name or fund_name.count("-") < 2 or ":" in fund_name:
            continue
        norm = _normalize(fund_name)
        raw_code = (row.get("Scheme Code") or "").strip()
        override_code = SCHEME_OVERRIDES.get(norm)

        # Future-proofing: if we know the canonical code for this exact fund name,
        # correct it even when the CSV already contains a numeric but stale code.
        if override_code and raw_code != override_code:
            row["Scheme Code"] = override_code
            resolved[fund_name] = override_code
            corrected_existing += 1
            continue

        if _is_valid_scheme_code(raw_code):
            continue
        if raw_code and not _is_valid_scheme_code(raw_code):
            row["Scheme Code"] = ""  # clear invalid platform codes e.g. GROWWEH
        target_rows.append(row)

    total_missing = len(target_rows)
    if total_missing == 0:
        if verbose:
            if corrected_existing:
                print(f"[resolver] Corrected {corrected_existing} existing scheme codes via override table.")
            else:
                print("[resolver] No missing scheme codes found.")
        return rows, resolved

    if verbose:
        print(f"[resolver] Resolving {total_missing} missing scheme codes (parallel)…")

    # ── Phase A: Override table — O(1) per fund, no network ───────────────────
    # NOTE(review): the collection loop above already consumes every override
    # hit, so this pass looks redundant — kept as a defensive re-check.
    mfapi_needed: list[dict[str, str]] = []
    override_count = 0

    for row in target_rows:
        fund_name = (row.get("Fund") or "").strip()
        code = SCHEME_OVERRIDES.get(_normalize(fund_name))
        if code:
            row["Scheme Code"] = code
            resolved[fund_name] = code
            override_count += 1
        else:
            mfapi_needed.append(row)

    if verbose and override_count:
        print(f" [resolver] {override_count} resolved via override table (instant)")
    if verbose and corrected_existing:
        print(f" [resolver] {corrected_existing} existing codes corrected via override table")

    # ── Phase B: mfapi search — parallel ThreadPoolExecutor ───────────────────
    if not mfapi_needed:
        if verbose:
            print(f"[resolver] Done. {len(resolved)}/{total_missing} resolved.")
        return rows, resolved

    lock = threading.Lock()  # guards the shared progress counter and prints
    completed = [0]  # single-cell list so inner scope can mutate the counter

    def _resolve_one(row: dict[str, str]) -> tuple[str, str | None, str | None]:
        """Returns (fund_name, scheme_code_or_None, matched_name_or_None)."""
        fund_name = (row.get("Fund") or "").strip()
        query = _search_query(fund_name)
        candidates = _search_mfapi(query)
        match = _best_match(candidates, fund_name)
        if match:
            return fund_name, str(match["schemeCode"]), match.get("schemeName", "")
        return fund_name, None, None

    # 20 workers: mfapi is pure REST, stateless, handles concurrency fine
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_to_row = {executor.submit(_resolve_one, row): row for row in mfapi_needed}
        for future in as_completed(future_to_row):
            row = future_to_row[future]
            fund_name = (row.get("Fund") or "").strip()
            try:
                _, code, matched_name = future.result()
            except Exception:
                code = matched_name = None

            with lock:
                completed[0] += 1
                n = completed[0]
                total_mfapi = len(mfapi_needed)
                if code:
                    row["Scheme Code"] = code
                    resolved[fund_name] = code
                    if verbose:
                        print(f" [{n}/{total_mfapi}] OK {fund_name[:55]}")
                        print(f" -> [{code}] {(matched_name or '')[:55]}")
                else:
                    if verbose:
                        print(f" [{n}/{total_mfapi}] NO {fund_name[:55]} -- no match")

    if verbose:
        print(f"[resolver] Done. {len(resolved)}/{total_missing} resolved "
              f"({override_count} overrides + {len(resolved)-override_count-corrected_existing} mfapi"
              f"{f', {corrected_existing} corrected existing codes' if corrected_existing else ''}).")
    return rows, resolved
src/weightage.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Weightage scoring algorithm for mutual fund schemes.
3
+
4
+ Scoring method: Sum of column weights where cell qualifies for Light Green (Top/Bottom 10)
5
+ AND is NOT overridden by Light Red fill (threshold violations).
6
+
7
+ Weight Distribution (Advisor-revised, March 2026):
8
+ 1. Sortino Ratio: 1.300 (Top 10, higher is better)
9
+ 2. Sharpe Ratio: 1.200 (Top 10, higher is better)
10
+ 3. Information Ratio: 1.000 (Top 10, higher is better, Light Red if < 0)
11
+ 4. Alpha: 1.000 (Top 10, higher is better, Light Red if < 1)
12
+ 5. Maximum Drawdown: 1.350 (Top 10, closest to 0 is better)
13
+ 6. Down Market Capture: 1.000 (Bottom 10, lower is better)
14
+ 7. Standard Deviation: 1.000 (Bottom 10, lower is better)
15
+ 8. 10 Years CAGR: 0.750 (Top 10, higher is better, Light Red if < Category Avg)
16
+ 9. 5 Years CAGR: 0.600 (Top 10, higher is better, Light Red if < Category Avg)
17
+ 10. 3 Years CAGR: 0.400 (Top 10, higher is better, Light Red if < Category Avg)
18
+ 11. P/E Ratio: 0.150 (Bottom 10, lower is better)
19
+ 12. TER: 0.150 (Bottom 10, lower is better)
20
+ 13. Turnover (%): 0.100 (Bottom 10, lower is better)
21
+
22
+ Total: 10.000
23
+ """
24
+
25
+ import math
26
+ from typing import List, Optional, Dict
27
+ from src.models import Fund
28
+
29
+
30
+ # ─── Weight map (Advisor-revised March 2026) ─────────────────────────────────
31
+ WEIGHTS: Dict[str, float] = {
32
+ "sortino": 1.30,
33
+ "sharpe": 1.20,
34
+ "info_ratio": 1.00,
35
+ "alpha": 1.00,
36
+ "max_drawdown": 1.35,
37
+ "down_capture": 1.00,
38
+ "std_dev": 1.00,
39
+ "cagr_10y": 0.75,
40
+ "cagr_5y": 0.60,
41
+ "cagr_3y": 0.40,
42
+ "pe_ratio": 0.15,
43
+ "ter": 0.15,
44
+ "turnover": 0.10,
45
+ }
46
+
47
+ # Sanity-check: total should equal 10.000
48
+ _TOTAL = round(sum(WEIGHTS.values()), 3)
49
+ assert _TOTAL == 10.000, f"WEIGHTS do not sum to 10.000 β€” got {_TOTAL}"
50
+
51
# Metrics ranked descending — membership in the Top 10 earns Light Green.
TOP_10_METRICS = [
    "sharpe",
    "sortino",
    "alpha",
    "info_ratio",
    "max_drawdown",
    "cagr_3y",
    "cagr_5y",
    "cagr_10y",
]

# Metrics ranked ascending — membership in the Bottom 10 earns Light Green.
BOTTOM_10_METRICS = [
    "ter",
    "turnover",
    "std_dev",
    "down_capture",
    "pe_ratio",
]

# Dual-condition metrics: a green cell may still be overridden to Light Red.
# Value is (rule_type, threshold); threshold is unused for category-avg rules.
DUAL_CONDITION_RULES: Dict[str, tuple] = {
    "alpha": ("below_value", 1),              # Light Red when alpha < 1%
    "info_ratio": ("below_value", 0),         # Light Red when IR < 0
    "cagr_3y": ("below_category_avg", None),  # Light Red when below category avg
    "cagr_5y": ("below_category_avg", None),
    "cagr_10y": ("below_category_avg", None),
}
72
+
73
+
74
+ # ─── Value helpers ────────────────────────────────────────────────────────────
75
+
76
+ def _is_valid(v) -> bool:
77
+ """True if v is a real, non-zero, non-NaN number."""
78
+ if v is None:
79
+ return False
80
+ if isinstance(v, float) and (v != v): # NaN check
81
+ return False
82
+ # 0.0 is treated as missing/not-applicable for risk metrics
83
+ if v == 0:
84
+ return False
85
+ return True
86
+
87
+
88
+ def _is_valid_drawdown(v) -> bool:
89
+ """
90
+ For Maximum Drawdown specifically: 0.0 is a genuine data-quality gap
91
+ (overnight/liquid funds sometimes publish 0 when the real figure was never
92
+ fetched). Treat 0 as invalid so that only funds with a real (negative)
93
+ drawdown value compete in the ranking.
94
+ """
95
+ if v is None:
96
+ return False
97
+ if isinstance(v, float) and v != v: # NaN
98
+ return False
99
+ if v == 0:
100
+ return False # ← exact zero excluded; see drawdown_zero_fix() below
101
+ return True
102
+
103
+
104
# ─── Ranking helpers ──────────────────────────────────────────────────────────

def _top_n(fund: Fund, peers: List[Fund], metric: str, n: int = 10) -> bool:
    """
    True when *fund* ranks within the top-N (highest values) for *metric*
    among its category peers.

    Special case: for Information Ratio an exact 0.0 participates in the
    ranking (Excel treats 0 as a valid value; only < 0 is "red"), whereas
    other metrics treat 0 as missing data.
    """
    if metric == "info_ratio":
        # 0 is a genuine value here; reject only None/NaN.
        def usable(v):
            return v is not None and not (isinstance(v, float) and v != v)
    else:
        usable = _is_valid

    own = getattr(fund, metric, None)
    if not usable(own):
        return False

    pool = sorted(
        (getattr(p, metric, None) for p in peers if usable(getattr(p, metric, None))),
        reverse=True,
    )
    if len(pool) < 2:
        return False

    # Mirror Excel's "Top N items" conditional formatting, with N capped at
    # the number of valid entries; ties at the cutoff qualify.
    cutoff = pool[min(n, len(pool)) - 1]
    return own >= cutoff
139
+
140
+
141
def _top_n_drawdown(fund: Fund, peers: List[Fund], n: int = 10) -> bool:
    """
    Top-N check specialised for Maximum Drawdown.

    "Closest to 0" wins: -5% beats -20%, so a descending sort still puts
    the best fund first.  Only non-zero, non-None values participate (see
    _is_valid_drawdown), and strict-N is used (no small-category fallback)
    so a lone liquid fund with a real drawdown doesn't qualify purely
    because its category is tiny.
    """
    own = getattr(fund, "max_drawdown", None)
    if not _is_valid_drawdown(own):
        return False

    pool = sorted(
        (v for v in (getattr(p, "max_drawdown", None) for p in peers)
         if _is_valid_drawdown(v)),
        reverse=True,  # -5 > -20 → -5 is rank-1
    )
    if not pool:
        return False

    return own >= pool[min(n, len(pool)) - 1]
163
+
164
+
165
def _bottom_n(fund: Fund, peers: List[Fund], metric: str, n: int = 10) -> bool:
    """True when *fund* ranks within the bottom-N (lowest values) for *metric*."""
    own = getattr(fund, metric, None)
    if not _is_valid(own):
        return False

    pool = sorted(
        getattr(p, metric, None)
        for p in peers
        if _is_valid(getattr(p, metric, None))
    )
    if len(pool) < 2:
        return False

    # Mirror Excel's "Bottom N items" conditional formatting, with N capped
    # at the number of valid entries; ties at the cutoff qualify.
    return own <= pool[min(n, len(pool)) - 1]
181
+
182
+
183
def _category_avg(peers: List[Fund], metric: str) -> Optional[float]:
    """Arithmetic mean of the valid *metric* values across *peers* (None if none)."""
    values = [v for v in (getattr(p, metric, None) for p in peers) if _is_valid(v)]
    if not values:
        return None
    return sum(values) / len(values)
188
+
189
+
190
def _light_red(fund: Fund, metric: str, cat_avg: Optional[float]) -> bool:
    """True when *metric* triggers a Light Red override for this fund."""
    rule = DUAL_CONDITION_RULES.get(metric)
    if rule is None:
        return False

    val = getattr(fund, metric, None)
    if not _is_valid(val):
        return False

    rule_type, threshold = rule
    if rule_type == "below_value":
        return val < threshold
    if rule_type == "below_category_avg":
        return cat_avg is not None and val < cat_avg
    return False
203
+
204
+
205
# ─── Drawdown zero-cell fix ───────────────────────────────────────────────────

def drawdown_zero_fix(
    funds: List[Fund],
    *,
    verbose: bool = True,
) -> int:
    """
    Detect funds whose max_drawdown is exactly 0 (data-quality gap) and
    recompute it from live NAV history via the NAV engine.

    Strategy
    --------
    1. Collect every fund where max_drawdown is 0 or None, excluding
       debt-style categories (tiny/no drawdown), funds younger than ~3
       years (no 3Y NAV history), and funds already attempted by the
       csv_enrichment NAV phase.

    2. For each such fund, call compute_nav_metrics_for_scheme() requesting
       only ["Maximum Drawdown"], fanned out over a thread pool.

    3. If a real negative value comes back, write it to fund.max_drawdown.

    Returns the count of cells successfully fixed.

    NOTE: This function requires network access (mfapi.in + yfinance).
          It is intentionally separated from compute_scores() so callers
          can opt in only when enrichment is desired.
    """
    # Import here to avoid circular dependency at module level
    try:
        from src.nav_metrics_engine import NavEngineCache, compute_nav_metrics_for_scheme
    except ImportError:
        if verbose:
            print("[drawdown_fix] nav_metrics_engine not available — skipping.")
        return 0

    # Category prefixes that identify debt-style funds — these legitimately
    # have near-zero drawdown, so a 0 cell is not a data gap for them.
    DEBT_PREFIXES = ("debt", "liquid", "overnight", "money market", "gilt",
                     "fixed maturity", "interval", "fmp")

    # Aliased import so the closure below can do isinstance checks against it.
    from datetime import datetime as _dt
    _now = _dt.now()

    def _fund_age_years(f) -> float | None:
        # Age in years from the fund's launch date; None when the private
        # _launch_date attribute is absent or not a datetime.
        ld = getattr(f, "_launch_date", None)
        if not isinstance(ld, _dt):
            return None
        return (_now - ld).days / 365.25

    # Funds the csv_enrichment NAV phase already tried — a second pass over
    # those would only repeat the same failure.
    try:
        from src.csv_enrichment import _NAV_ATTEMPTED_FUNDS as _nav_attempted
    except Exception:
        _nav_attempted = set()

    zero_funds = [
        f for f in funds
        if (
            # Only target funds where drawdown is truly missing (0 or None)
            (f.max_drawdown == 0 or f.max_drawdown is None)
            # AND only equity/hybrid — debt funds have tiny/no drawdown, skip them
            # NOTE(review): assumes f.category is always a str — confirm Fund guarantees this
            and not any(f.category.lower().startswith(pfx) for pfx in DEBT_PREFIXES)
            # AND fund must be ≥3 years old — younger funds can't have 3Y NAV history
            # (unknown age passes through: None means "can't tell", so we still try)
            and (_fund_age_years(f) is None or _fund_age_years(f) >= 3.0)
            # AND skip funds already attempted by csv_enrichment NAV phase —
            # if enrichment couldn't fill MDD, a second pass won't either
            and f.name not in _nav_attempted
        )
    ]

    if not zero_funds:
        if verbose:
            print("[drawdown_fix] No zero/missing drawdown cells found.")
        return 0

    if verbose:
        print(f"[drawdown_fix] Attempting to fix {len(zero_funds)} drawdown cells …")

    from concurrent.futures import ThreadPoolExecutor, as_completed as _as_completed
    import threading as _threading

    # Bulk-preload cache before parallel workers start (2 SQL queries instead of N)
    try:
        from src.nav_metrics_engine import _bulk_preload_cache, resolve_benchmark_ticker
        _scheme_codes = [getattr(f, "_scheme_code", None) or "" for f in zero_funds]
        _bench_tickers = [resolve_benchmark_ticker(getattr(f, "benchmark", "") or "") for f in zero_funds]
        _bulk_preload_cache(_scheme_codes, _bench_tickers)
    except Exception:
        pass  # graceful degradation — workers will fall back to per-query

    cache = NavEngineCache()
    fixed = 0
    # NOTE(review): results are consumed on the main thread via as_completed,
    # so this lock looks redundant — kept as defensive serialization.
    _lock = _threading.Lock()

    # Partition: only funds carrying a non-empty _scheme_code can be fetched.
    with_code = [
        (f, getattr(f, "_scheme_code", None) or "", getattr(f, "benchmark", "") or "")
        for f in zero_funds
        if (getattr(f, "_scheme_code", None) or "").strip()
    ]
    no_code = [f for f in zero_funds if not (getattr(f, "_scheme_code", None) or "").strip()]

    if verbose:
        for f in no_code:
            print(f" SKIP {f.name[:55]} — no scheme code available")

    def _fix_one(args):
        # Worker: fetch only Maximum Drawdown for one scheme.
        # Returns (fund, mdd_or_None, skip_reason).
        fund, scheme_code, benchmark = args
        metrics, skip = compute_nav_metrics_for_scheme(
            scheme_code=scheme_code,
            benchmark_type=benchmark,
            needed_metrics=["Maximum Drawdown"],
            cache=cache,
        )
        mdd = metrics.get("Maximum Drawdown")
        reason = skip.get("Maximum Drawdown", "unknown")
        return fund, mdd, reason

    with ThreadPoolExecutor(max_workers=12) as executor:
        futures = {executor.submit(_fix_one, item): item for item in with_code}
        for fut in _as_completed(futures):
            try:
                fund, mdd, reason = fut.result()
            except Exception as e:
                # NOTE(review): worker failures are swallowed silently —
                # consider logging `e` so network errors are visible.
                continue
            # Only accept a real (non-zero) drawdown; 0 would reintroduce
            # the very data gap we are trying to fix.
            if mdd is not None and mdd != 0:
                with _lock:
                    fund.max_drawdown = mdd
                    fixed += 1
                if verbose:
                    print(f" FIXED {fund.name[:55]} → MDD = {mdd:.3f}%")
            else:
                if verbose:
                    print(f" MISS {fund.name[:55]} — {reason}")

    if verbose:
        print(f"[drawdown_fix] Done. Fixed {fixed}/{len(zero_funds)} cells.")

    return fixed
351
+
352
+
353
# ─── Main scoring engine ──────────────────────────────────────────────────────

def compute_scores(funds: List[Fund]) -> List[Fund]:
    """
    Score and rank every fund within its category (mutates in place).

    For each weighted metric:
      1. Top-N / Bottom-N membership inside the category peer group
         earns "Light Green".
      2. A green cell hit by a dual-condition rule becomes "Light Red"
         and contributes nothing.
      3. Surviving green cells add their column weight to the score.

    fund.score is capped at 10.0 (model scale).  Also sets
    fund.rank_in_category (1 = best within category) and
    fund.is_top_quartile (True for the top ⌈N/4⌉ funds).

    Returns the same list (mutated in-place) for convenience.
    """
    # Bucket funds by category — scoring is always peer-relative.
    by_category: Dict[str, List[Fund]] = {}
    for f in funds:
        by_category.setdefault(f.category, []).append(f)

    for peers in by_category.values():

        # Category averages feed the CAGR dual-condition (Light Red) rules.
        averages = {
            m: _category_avg(peers, m)
            for m in ("cagr_3y", "cagr_5y", "cagr_10y")
        }

        for f in peers:
            total = 0.0

            for metric, weight in WEIGHTS.items():
                # ── Green check: drawdown has its own ranking rule ───────
                if metric == "max_drawdown":
                    green = _top_n_drawdown(f, peers)
                elif metric in TOP_10_METRICS:
                    green = _top_n(f, peers, metric)
                elif metric in BOTTOM_10_METRICS:
                    green = _bottom_n(f, peers, metric)
                else:
                    green = False

                # ── Light Red override zeroes the contribution ───────────
                # (_light_red is a no-op for metrics without a rule)
                if green and _light_red(f, metric, averages.get(metric)):
                    green = False

                if green:
                    total += weight

            f.score = round(min(total, 10.0), 3)

        # ── Rank within category: score desc, then name, then load order ──
        ranked = sorted(
            peers,
            key=lambda f: (-(f.score or 0), (f.name or "").lower(), getattr(f, "order", 0)),
        )
        quartile_size = max(1, math.ceil(len(ranked) / 4))

        for position, f in enumerate(ranked, start=1):
            f.rank_in_category = position
            f.is_top_quartile = position <= quartile_size

    return funds