"""
analysis.py — NLG报告、行业影响、评估指标、消融实验
====================================================
从 v2_championship.py 拆出的分析与评估模块。
"""

import pandas as pd
import numpy as np
from sklearn.linear_model import QuantileRegressor
from sklearn.preprocessing import StandardScaler

from config import FACTOR_GROUPS, INDUSTRIES, INDUSTRY_ZH


# ═══════════════════════════════════════════════════════════
# INDUSTRY IMPACT RULE ENGINE
# ═══════════════════════════════════════════════════════════

def apply_industry_rules(rec):
    """Derive a risk label and a recommended action for every industry.

    Reads ``risk_level``, ``risk_bias``, ``vol_ratio`` and ``top_factor``
    from ``rec`` (via ``.get`` with defaults 'Low', 'Balanced', 1.0 and
    'Unknown') and walks ``INDUSTRIES``.  Industries without a dedicated
    rule set keep the default ('Low', 'Routine monitoring').

    Returns a dict with two keys per industry, ``'<industry>_risk'`` and
    ``'<industry>_action'``, inserted in that order.
    """
    level = rec.get('risk_level', 'Low')
    bias = rec.get('risk_bias', 'Balanced')
    band_ratio = rec.get('vol_ratio', 1.0)
    driver = rec.get('top_factor', 'Unknown')

    high = level == 'High'
    medium = level == 'Medium'
    up = bias == 'Upward'
    down = bias == 'Downward'
    wide_band = band_ratio > 1.3

    fallback = ('Low', 'Routine monitoring')

    def aviation():
        if high and up:
            label, advice = 'High', 'Increase hedging coverage; review fuel cost budget'
        elif high:
            label, advice = 'High', 'Elevated volatility; prepare contingency liquidity'
        elif up:
            label, advice = 'Medium-High', 'Monitor fuel cost exposure; consider forward contracts'
        elif medium:
            label, advice = 'Medium', 'Review quarterly fuel hedging strategy'
        else:
            label, advice = fallback
        # The dominant factor adds a note regardless of which branch fired.
        driver_notes = (
            ('Demand', '; demand-driven → cost pressure may persist'),
            ('Supply', '; supply-driven → watch OPEC decisions'),
            ('Risk_Geo', '; geopolitical risk → event monitoring'),
        )
        for factor, note in driver_notes:
            if driver == factor:
                advice += note
                break
        return label, advice

    def logistics():
        if high and up:
            return 'Medium-High', 'Review transport cost pass-through; working capital buffer'
        if high:
            return 'Medium', 'Monitor diesel/freight cost exposure'
        if medium and up:
            return 'Medium', 'Review fuel surcharge mechanisms'
        return fallback

    def chemicals():
        if up and driver == 'Supply':
            return 'High', 'Feedstock cost pressure; margin compression likely'
        if high:
            return 'Medium-High', 'Monitor naphtha/ethylene spread; review procurement'
        if wide_band:
            return 'Medium', 'Profit uncertainty elevated; scenario planning advised'
        return fallback

    def manufacturing():
        if high and up:
            return 'High', 'Energy cost surge risk; review energy hedging'
        if high:
            return 'Medium', 'Elevated input cost volatility'
        if medium:
            return 'Medium', 'Monitor energy procurement costs'
        return fallback

    def upstream():
        # Producers are hurt by falling prices, so the bias test is inverted.
        if down and high:
            return 'High', 'Revenue decline risk; review covenant compliance & liquidity'
        if down:
            return 'Medium-High', 'Downside tail expanding; monitor cash flow coverage'
        if up:
            return 'Low', 'Revenue tailwind; capex commitment review advised'
        return 'Low-Medium', 'Balanced outlook'

    evaluators = {
        'Aviation': aviation,
        'Logistics': logistics,
        'Chemicals': chemicals,
        'Manufacturing': manufacturing,
        'Upstream_OG': upstream,
    }

    rules = {}
    for industry in INDUSTRIES:
        evaluator = evaluators.get(industry)
        label, advice = evaluator() if evaluator is not None else fallback
        rules[f'{industry}_risk'] = label
        rules[f'{industry}_action'] = advice
    return rules


# ═══════════════════════════════════════════════════════════
# REGIME ECONOMIC NARRATIVES
# ═══════════════════════════════════════════════════════════

# Narrative templates keyed by regime label.  Each entry carries three
# Chinese text fields -- 'history', 'implication' and 'hedge_advice' -- that
# generate_nlg_report() inserts verbatim when a row's 'regime_match' equals
# the key.  The keys and texts are emitted at runtime, so they must match the
# labels produced upstream exactly.
REGIME_NARRATIVES = {
    # 2008 global financial crisis.
    '2008 金融危机': {
        'history': '2008年全球金融危机期间，油价从$147暴跌至$32，降幅78%。需求端崩塌是主因——全球GDP收缩、贸易锐减、制造业PMI普遍跌破40。',
        'implication': '需求崩塌格局下，下游成本端企业短期受益于低油价，但总需求萎缩拖累整体营收。上游企业面临最大冲击。',
        'hedge_advice': '上游企业应立即锁定远期销售价格；下游企业可逢低建仓，锁定低价原料。',
    },
    # 2014-16 shale oil supply shock.
    '2014 页岩油冲击': {
        'history': '2014-16年美国页岩油产量爆发（+400万桶/日），叠加OPEC拒绝减产，油价从$110跌至$26。供给过剩主导。',
        'implication': '供给过剩格局持续时间长（18个月以上），下游企业成本优势可持续；上游企业需要重组债务、缩减资本开支。',
        'hedge_advice': '重点关注远期曲线结构（contango加深），利用期货锁价窗口。下游企业延长采购合约期限。',
    },
    # 2020 COVID-19 demand shock.
    '2020 COVID': {
        'history': '2020年COVID-19导致全球需求暴减2000万桶/日，WTI期货历史性跌至负值。需求冲击+仓储危机双重打击。',
        'implication': '极端需求冲击下，航空业客运量降90%+，物流链中断。但复苏速度可能超预期——V型反弹是历史常态。',
        'hedge_advice': '短期：保持现金流弹性，避免过度套保。中期：关注OPEC+协调减产信号，逢低建立多头头寸。',
    },
    # 2022 Russia-Ukraine conflict.
    '2022 俄乌冲突': {
        'history': '2022年俄乌冲突导致俄罗斯原油出口受制裁，供给缺口+地缘溢价推升布伦特至$130+。地缘政治+供给双重冲击。',
        'implication': '地缘驱动的价格飙升通常突然但短暂（3-6个月），随后制裁适应和替代供给逐步消化溢价。',
        'hedge_advice': '事件驱动行情中，期权策略优于期货——买入看涨期权锁定上限成本，保留价格回落的收益空间。',
    },
    # 2023 OPEC+ production cuts.
    '2023 OPEC减产': {
        'history': '2023年OPEC+主动减产200万桶/日，托底油价在$70-90区间。供给管理型市场，价格波动率较低。',
        'implication': 'OPEC减产格局下，价格区间可预测性较高，但下行风险来自减产执行率下滑和非OPEC增产。',
        'hedge_advice': '低波动环境适合使用零成本领（collar）策略，锁定窄价格带。',
    },
    # Normal / low-volatility regime.
    '常态/低波动': {
        'history': '油价处于常态波动区间，无明显单一因子主导。市场处于供需基本平衡状态。',
        'implication': '常态下关注结构性变化信号——OPEC会议决策、美国钻井数趋势、中国PMI走向。',
        'hedge_advice': '常态下对冲比例可适当降低（25-40%），使用低成本期货锁价即可。',
    },
}


# ═══════════════════════════════════════════════════════════
# NLG REPORT GENERATION (ENHANCED)
# ═══════════════════════════════════════════════════════════

def _scaled_pct(row, key):
    """Return ``row[key] * 100``, using 0.0 when the key is absent or None/NaN."""
    value = row.get(key, 0)
    if value is None or pd.isna(value):
        return 0.0
    return value * 100


def generate_nlg_report(row):
    """Build the Chinese-language monthly risk report for one result row.

    Args:
        row: Mapping-like record (dict or pandas Series) supporting ``[]``
            and ``.get``.  Required keys: ``test_date``, ``risk_level``,
            ``risk_bias`` and ``pred_q10_1m`` / ``pred_q50_1m`` /
            ``pred_q90_1m`` / ``pred_vol`` (fractions, rendered as percent).
            Optional keys: ``top_factor``, ``pred_q*_3m``, ``cqr_q*_1m``,
            ``regime_match``, ``regime_similarity``, ``<industry>_risk``
            and ``scenario_*``.

    Returns:
        The report text: core judgement, optional 3M / CQR intervals,
        optional regime narrative, industry impact and stress test.

    Missing optional values (absent key, None or NaN) are treated uniformly:
    optional sections are omitted, numeric fields fall back to 0 and the top
    factor falls back to 'Unknown', so the text never contains "nan".
    """
    date = pd.Timestamp(row['test_date']).strftime('%Y年%m月')
    rl = row['risk_level']
    rb = row['risk_bias']
    top = row.get('top_factor', 'Unknown')
    if top is None or (isinstance(top, float) and pd.isna(top)):
        # A missing cell in a pandas row is NaN, not an absent key.
        top = 'Unknown'
    q10 = row['pred_q10_1m'] * 100
    q50 = row['pred_q50_1m'] * 100
    q90 = row['pred_q90_1m'] * 100
    vol = row['pred_vol'] * 100

    rl_zh = {'Low': '低', 'Medium': '中等', 'High': '高'}.get(rl, rl)
    rb_zh = {'Upward': '偏上行', 'Downward': '偏下行', 'Balanced': '均衡'}.get(rb, rb)
    factor_zh = {
        'Price': '价格联动', 'Supply': '供给端', 'Demand': '需求端',
        'Risk_Geo': '地缘政治/风险', 'Technical': '技术面',
    }.get(top, top)

    # -- Part 1: core judgement --
    summary = (
        f"【{date}油价风险研判】\n"
        f"■ 核心判断：风险等级{rl_zh}，方向{rb_zh}，由{factor_zh}因子主导。\n"
        f"■ 1M预测区间：[{q10:+.1f}%, {q90:+.1f}%]，中枢{q50:+.1f}%，波动率{vol:.1f}%。\n"
    )

    # 3M interval: only rendered when all three quantiles are available.
    q10_3m, q90_3m, q50_3m = (row.get(f'pred_{q}_3m') for q in ('q10', 'q90', 'q50'))
    if pd.notna(q10_3m) and pd.notna(q90_3m) and pd.notna(q50_3m):
        summary += f"■ 3M预测区间：[{q10_3m*100:+.1f}%, {q90_3m*100:+.1f}%]，中枢{q50_3m*100:+.1f}%。\n"

    # Conformal (CQR) interval: needs both bounds.
    cqr_lo = row.get('cqr_q10_1m')
    cqr_hi = row.get('cqr_q90_1m')
    if pd.notna(cqr_lo) and pd.notna(cqr_hi):
        summary += f"■ CQR校准区间：[{cqr_lo*100:+.1f}%, {cqr_hi*100:+.1f}%]（分布自由覆盖保证）。\n"

    # -- Part 2: regime narrative --
    regime = row.get('regime_match', '')
    # NaN is truthy, so require a real, non-empty string label.
    if isinstance(regime, str) and regime and regime != 'Unknown':
        regime_sim_pct = _scaled_pct(row, 'regime_similarity')
        summary += f"\n▶ 格局识别：当前最接近「{regime}」（相似度{regime_sim_pct:.0f}%）\n"
        narr = REGIME_NARRATIVES.get(regime, {})
        if narr:
            summary += f"  历史参照：{narr['history']}\n"
            summary += f"  当前启示：{narr['implication']}\n"
            summary += f"  对冲建议：{narr['hedge_advice']}\n"

    # -- Part 3: industry impact --
    # Substring matching is deliberate: 'Medium-High' counts as high risk and
    # 'Low-Medium' counts as medium risk.
    high_risk = []
    med_risk = []
    for ind in INDUSTRIES:
        risk = str(row.get(f'{ind}_risk', 'Low'))
        if 'High' in risk:
            high_risk.append(INDUSTRY_ZH.get(ind, ind))
        elif 'Medium' in risk:
            med_risk.append(INDUSTRY_ZH.get(ind, ind))
    if high_risk:
        summary += f"\n▶ 高风险行业：{'、'.join(high_risk)}——建议提升套保覆盖率至60-80%。\n"
    if med_risk:
        summary += f"▶ 中风险行业：{'、'.join(med_risk)}——建议维持25-50%套保覆盖。\n"
    if not high_risk and not med_risk:
        summary += "\n▶ 各行业风险均处于可控水平，建议维持常规套保比例。\n"

    # -- Part 4: stress test --
    base = _scaled_pct(row, 'scenario_base')
    vix = _scaled_pct(row, 'scenario_vix_shock')
    supply = _scaled_pct(row, 'scenario_supply_cut')
    demand = _scaled_pct(row, 'scenario_demand_crash')
    # Worst case over every displayed scenario.  Values are NaN-free here, so
    # min() is well-defined (min(nan, x) and min(x, nan) disagree otherwise).
    worst = min(base, vix, supply, demand)
    summary += (
        f"\n▶ 压力测试：基准{base:+.1f}% | VIX翻倍{vix:+.1f}% | "
        f"供给中断{supply:+.1f}% | 需求崩塌{demand:+.1f}%\n"
        f"  最大下行风险：{worst:+.1f}%，建议预留相应流动性缓冲。\n"
    )

    return summary


def generate_all_reports(results):
    """Produce one NLG report per row of ``results``.

    Returns a dict mapping the row's ``test_date`` formatted as 'YYYY-MM' to
    the text from ``generate_nlg_report``.  Rows that share a month overwrite
    earlier ones, so the last such row wins.
    """
    return {
        pd.Timestamp(record['test_date']).strftime('%Y-%m'): generate_nlg_report(record)
        for _, record in results.iterrows()
    }


# ═══════════════════════════════════════════════════════════
# EVALUATION METRICS
# ═══════════════════════════════════════════════════════════

def _interval_score(actual, lower, upper, alpha=0.2):
    """Mean interval score of a central (1 - alpha) prediction interval.

    Score per observation is the interval width plus ``2 / alpha`` times the
    distance by which the realized value falls outside the interval.
    """
    penalty = 2 / alpha
    return (
        (upper - lower)
        + penalty * np.maximum(lower - actual, 0)
        + penalty * np.maximum(actual - upper, 0)
    ).mean()


def evaluate_results(results):
    """Print interval, volatility, risk-level, factor and scenario diagnostics.

    Only rows whose ``actual_ret_1m`` is present are evaluated.  Every section
    below, including the "latest" scenario and NLG report, is computed on that
    filtered frame, so "latest" means the last row with a realized 1M return.

    Args:
        results: DataFrame with at least ``actual_ret_1m``, ``actual_vol``,
            ``actual_ret_3m``, ``pred_*`` / ``lgb_*`` quantile columns,
            ``pred_vol``, ``baseline_ewma``, ``risk_level`` and the
            ``scenario_*`` columns.  ``cqr_*``, ``top_factor`` and
            ``shap_<group>`` columns are used when present.

    Returns:
        None.  All output goes to stdout.  When no row has a realized 1M
        return, a notice is printed and the function returns early.
    """
    # Evaluate only months with a realized value (excludes the live forecast).
    evaluated = results[results['actual_ret_1m'].notna()].copy()
    n = len(evaluated)

    print(f"\n{'='*65}")
    print("V2 CHAMPIONSHIP — EVALUATION")
    print("=" * 65)

    if n == 0:
        # Nothing to score; the sections below would fail on an empty frame.
        print("  No rows with a realized 1M return; nothing to evaluate.")
        return

    ar = evaluated['actual_ret_1m'].values
    av = evaluated['actual_vol'].values

    # 1M interval: coverage, coverage in the high-vol half, and interval score.
    print("\n--- 1M INTERVAL ---")
    models = [('QR (vol-adapt)', 'pred'), ('LightGBM', 'lgb')]
    if 'cqr_q10_1m' in evaluated.columns:
        models.append(('Conformal QR', 'cqr'))
    # High-volatility subset is model-independent, so compute it once.
    hi = av > np.median(av)
    has_hi = hi.sum() > 0
    for model_name, prefix in models:
        q10 = evaluated[f'{prefix}_q10_1m'].values
        q90 = evaluated[f'{prefix}_q90_1m'].values
        cov = ((ar >= q10) & (ar <= q90)).mean()
        wis = _interval_score(ar, q10, q90)
        cov_hi = ((ar[hi] >= q10[hi]) & (ar[hi] <= q90[hi])).mean() if has_hi else 0
        print(f"  {model_name:<18} Cov={cov:.1%} HiCov={cov_hi:.1%} WIS={wis:.4f}")

    # Naive benchmark: constant interval from the in-sample 10%/90% quantiles.
    nq10 = np.full(n, np.quantile(ar, 0.10))
    nq90 = np.full(n, np.quantile(ar, 0.90))
    naive_wis = _interval_score(ar, nq10, nq90)
    print(f"  {'Naive':<18} WIS={naive_wis:.4f}")

    # 3M interval, reported only when more than 10 realized 3M returns exist.
    print("\n--- 3M INTERVAL ---")
    m3 = evaluated['actual_ret_3m'].notna()
    if m3.sum() > 10:
        ar3 = evaluated.loc[m3, 'actual_ret_3m'].values
        q10_3 = evaluated.loc[m3, 'pred_q10_3m'].values
        q90_3 = evaluated.loc[m3, 'pred_q90_3m'].values
        cov3 = ((ar3 >= q10_3) & (ar3 <= q90_3)).mean()
        wis3 = _interval_score(ar3, q10_3, q90_3)
        print(f"  QR 3M (vol-adapt)  Cov={cov3:.1%} WIS={wis3:.4f} n={m3.sum()}")

    # Volatility forecast accuracy against the EWMA baseline.
    print("\n--- VOL SCORE ---")
    pv = evaluated['pred_vol'].values
    bl = evaluated['baseline_ewma'].values
    for nm, prd in [('V2 BL+Resid', pv), ('EWMA', bl)]:
        # Plain NumPy RMSE; avoids re-importing sklearn.metrics on every pass.
        rmse = np.sqrt(np.mean((av - prd) ** 2))
        corr = np.corrcoef(av, prd)[0, 1]
        print(f"  {nm:<18} RMSE={rmse:.4f} corr={corr:+.3f}")

    # Mean realized volatility per predicted risk level.
    print("\n--- RISK LEVELS ---")
    for lvl in ['Low', 'Medium', 'High']:
        mask = evaluated['risk_level'] == lvl
        if mask.sum() > 0:
            print(f"  {lvl:<8}: vol={av[mask].mean():.4f} n={mask.sum()}")

    # How often each factor group was the top driver.
    print("\n--- FACTOR FREQ ---")
    if 'top_factor' in evaluated.columns:
        for fac, cnt in evaluated['top_factor'].value_counts().items():
            print(f"  {fac:<12}: {cnt} ({cnt/n:.1%})")

    # Mean absolute SHAP contribution per factor group.
    print("\n--- SHAP (avg) ---")
    for g in FACTOR_GROUPS:
        col = f'shap_{g}'
        if col in evaluated.columns:
            print(f"  {g:<12}: {evaluated[col].abs().mean():.4f}")

    # Scenario values of the last evaluated row.
    print("\n--- SCENARIO (latest) ---")
    lat = evaluated.iloc[-1]
    print(f"  Base:         {lat['scenario_base']*100:+.1f}%")
    print(f"  VIX shock:    {lat['scenario_vix_shock']*100:+.1f}%")
    print(f"  Supply cut:   {lat['scenario_supply_cut']*100:+.1f}%")
    print(f"  Demand crash: {lat['scenario_demand_crash']*100:+.1f}%")

    # NLG sample for the same row.
    print("\n--- NLG REPORT (latest) ---")
    print(generate_nlg_report(lat))


# ═══════════════════════════════════════════════════════════
# ABLATION EXPERIMENTS
# ═══════════════════════════════════════════════════════════

def _run_qr_eval(panel, feat_list, train_window=120):
    """Walk-forward 10%/90% quantile regression on a feature subset.

    For each row ``i`` from ``train_window`` to the second-to-last row, fits
    two ``QuantileRegressor`` models on the preceding ``train_window`` rows
    and scores the resulting interval against ``target_ret_1m`` at row ``i``.

    Args:
        panel: DataFrame containing ``target_ret_1m`` and the feature columns.
        feat_list: Candidate feature names.  A feature is used in a given
            window only if it exists and is more than 80% non-null there.
        train_window: Number of trailing rows used for each fit.

    Returns:
        ``(cov, wis, totals)``: interval hit rate, mean interval score and
        number of scored steps.  With no scored step, ``cov`` is 0 and
        ``wis`` is the sentinel 999.
    """
    hits, totals, wis_list = 0, 0, []
    penalty = 2 / 0.2  # interval-score penalty for an 80% central interval
    for i in range(train_window, len(panel) - 1):
        actual = panel['target_ret_1m'].iloc[i]
        if pd.isna(actual):
            # Nothing to score against; skip before doing any fitting work.
            continue

        train_df = panel.iloc[i - train_window:i]
        test_df = panel.iloc[i:i + 1]
        avail = [f for f in feat_list if f in train_df.columns and train_df[f].notna().mean() > 0.8]
        if len(avail) < 2:
            continue

        # Impute with training-window medians only, so the test row cannot leak.
        medians = train_df[avail].median()
        X_tr = train_df[avail].fillna(medians)
        X_te = test_df[avail].fillna(medians)
        sc = StandardScaler()
        X_tr_s = sc.fit_transform(X_tr)
        X_te_s = sc.transform(X_te)

        # Keep feature rows and targets aligned by position.  Truncating X to
        # the first len(y) rows pairs the wrong rows whenever a missing target
        # is not at the very end of the window.
        has_target = train_df['target_ret_1m'].notna().to_numpy()
        X_fit = X_tr_s[has_target]
        y = train_df['target_ret_1m'].to_numpy()[has_target]
        if len(y) == 0:
            continue

        try:
            qr10 = QuantileRegressor(quantile=0.10, alpha=0.1, solver='highs')
            qr90 = QuantileRegressor(quantile=0.90, alpha=0.1, solver='highs')
            qr10.fit(X_fit, y)
            qr90.fit(X_fit, y)
            p10, p90 = qr10.predict(X_te_s)[0], qr90.predict(X_te_s)[0]
        except Exception:
            # Best effort: a failed fit skips this step.  Unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
            continue

        if p10 > p90:
            # Independent quantile fits can cross; reorder the bounds.
            p10, p90 = p90, p10
        hits += 1 if p10 <= actual <= p90 else 0
        totals += 1
        wis_list.append((p90 - p10) + penalty * max(p10 - actual, 0) + penalty * max(actual - p90, 0))

    cov = hits / totals if totals > 0 else 0
    wis = np.mean(wis_list) if wis_list else 999
    return cov, wis, totals


def run_ablation(panel, features):
    """Run the ablation study: training-window sweep plus factor-group leave-one-out.

    Args:
        panel: DataFrame passed through to ``_run_qr_eval``.
        features: Full list of feature names.

    Returns:
        List of dicts, one per experiment, with ``type``, ``param``,
        ``param_label``, ``cov``, ``wis`` and ``n``.  Leave-one-out entries
        also carry ``delta_cov`` and ``delta_wis`` relative to the all-factor
        baseline.  Progress is printed to stdout.
    """
    print("\n--- ABLATION EXPERIMENTS ---")
    ablation_results = []

    # Window used for the all-factor baseline and every leave-one-out run.
    baseline_window = 120
    window_scores = {}

    # -- Part 1: window ablation --
    print("  [窗口消融]")
    for w in [84, 120, 180]:
        cov, wis, n_ab = _run_qr_eval(panel, features, train_window=w)
        window_scores[w] = (cov, wis, n_ab)
        ablation_results.append({
            'type': 'window', 'param': w, 'param_label': f'{w}月',
            'cov': round(cov, 3), 'wis': round(wis, 4), 'n': n_ab
        })
        print(f"    Window={w:>3}: Cov={cov:.1%} WIS={wis:.4f} n={n_ab}")

    # -- Part 2: factor-group leave-one-out --
    print("  [因子组消融 — Leave-One-Out]")
    # The all-factor baseline is the same run as the baseline-window entry of
    # the sweep above, so reuse it rather than repeating the walk-forward.
    if baseline_window in window_scores:
        base_cov, base_wis, base_n = window_scores[baseline_window]
    else:
        base_cov, base_wis, base_n = _run_qr_eval(panel, features, train_window=baseline_window)
    ablation_results.append({
        'type': 'factor_group', 'param': 'ALL', 'param_label': '全部因子',
        'cov': round(base_cov, 3), 'wis': round(base_wis, 4), 'n': base_n
    })
    print(f"    ALL (baseline): Cov={base_cov:.1%} WIS={base_wis:.4f}")

    group_zh = {'Price': '价格联动', 'Supply': '供给端', 'Demand': '需求端',
                'Risk_Geo': '地缘/风险', 'Technical': '技术面'}

    for group, members in FACTOR_GROUPS.items():
        # Drop this group's features; skip if fewer than two features remain.
        reduced = [f for f in features if f not in members]
        if len(reduced) < 2:
            continue
        cov, wis, n_ab = _run_qr_eval(panel, reduced, train_window=baseline_window)
        delta_cov = cov - base_cov
        delta_wis = wis - base_wis
        ablation_results.append({
            'type': 'factor_group', 'param': group,
            'param_label': f'去除{group_zh.get(group, group)}',
            'cov': round(cov, 3), 'wis': round(wis, 4), 'n': n_ab,
            'delta_cov': round(delta_cov, 3), 'delta_wis': round(delta_wis, 4),
        })
        print(f"    -{group:<12}: Cov={cov:.1%} (Δ{delta_cov:+.1%}) "
              f"WIS={wis:.4f} (Δ{delta_wis:+.4f})")

    return ablation_results