""" analysis.py — NLG报告、行业影响、评估指标、消融实验 ==================================================== 从 v2_championship.py 拆出的分析与评估模块。 """ import pandas as pd import numpy as np from sklearn.linear_model import QuantileRegressor from sklearn.preprocessing import StandardScaler from config import FACTOR_GROUPS, INDUSTRIES, INDUSTRY_ZH # ═══════════════════════════════════════════════════════════ # INDUSTRY IMPACT RULE ENGINE # ═══════════════════════════════════════════════════════════ def apply_industry_rules(rec): """基于风险等级、偏置、因子等推断各行业的风险和建议。""" risk_level = rec.get('risk_level', 'Low') risk_bias = rec.get('risk_bias', 'Balanced') vol_ratio = rec.get('vol_ratio', 1.0) top_fac = rec.get('top_factor', 'Unknown') is_high = risk_level == 'High' is_medium = risk_level == 'Medium' is_upward = risk_bias == 'Upward' is_downward = risk_bias == 'Downward' wide = vol_ratio > 1.3 rules = {} for industry in INDUSTRIES: ind_risk = 'Low' ind_action = 'Routine monitoring' if industry == 'Aviation': if is_high and is_upward: ind_risk, ind_action = 'High', 'Increase hedging coverage; review fuel cost budget' elif is_high: ind_risk, ind_action = 'High', 'Elevated volatility; prepare contingency liquidity' elif is_upward: ind_risk, ind_action = 'Medium-High', 'Monitor fuel cost exposure; consider forward contracts' elif is_medium: ind_risk, ind_action = 'Medium', 'Review quarterly fuel hedging strategy' if top_fac == 'Demand': ind_action += '; demand-driven → cost pressure may persist' elif top_fac == 'Supply': ind_action += '; supply-driven → watch OPEC decisions' elif top_fac == 'Risk_Geo': ind_action += '; geopolitical risk → event monitoring' elif industry == 'Logistics': if is_high and is_upward: ind_risk, ind_action = 'Medium-High', 'Review transport cost pass-through; working capital buffer' elif is_high: ind_risk, ind_action = 'Medium', 'Monitor diesel/freight cost exposure' elif is_medium and is_upward: ind_risk, ind_action = 'Medium', 'Review fuel surcharge mechanisms' elif industry == 'Chemicals': if is_upward and top_fac == 'Supply': ind_risk, ind_action = 'High', 'Feedstock cost pressure; margin compression likely' elif is_high: ind_risk, ind_action = 'Medium-High', 'Monitor naphtha/ethylene spread; review procurement' elif wide: ind_risk, ind_action = 'Medium', 'Profit uncertainty elevated; scenario planning advised' elif industry == 'Manufacturing': if is_high and is_upward: ind_risk, ind_action = 'High', 'Energy cost surge risk; review energy hedging' elif is_high: ind_risk, ind_action = 'Medium', 'Elevated input cost volatility' elif is_medium: ind_risk, ind_action = 'Medium', 'Monitor energy procurement costs' elif industry == 'Upstream_OG': if is_downward and is_high: ind_risk, ind_action = 'High', 'Revenue decline risk; review covenant compliance & liquidity' elif is_downward: ind_risk, ind_action = 'Medium-High', 'Downside tail expanding; monitor cash flow coverage' elif is_upward: ind_risk, ind_action = 'Low', 'Revenue tailwind; capex commitment review advised' else: ind_risk, ind_action = 'Low-Medium', 'Balanced outlook' rules[f'{industry}_risk'] = ind_risk rules[f'{industry}_action'] = ind_action return rules # ═══════════════════════════════════════════════════════════ # REGIME ECONOMIC NARRATIVES # ═══════════════════════════════════════════════════════════ REGIME_NARRATIVES = { '2008 金融危机': { 'history': '2008年全球金融危机期间,油价从$147暴跌至$32,降幅78%。需求端崩塌是主因——全球GDP收缩、贸易锐减、制造业PMI普遍跌破40。', 'implication': '需求崩塌格局下,下游成本端企业短期受益于低油价,但总需求萎缩拖累整体营收。上游企业面临最大冲击。', 'hedge_advice': '上游企业应立即锁定远期销售价格;下游企业可逢低建仓,锁定低价原料。', }, '2014 页岩油冲击': { 'history': '2014-16年美国页岩油产量爆发(+400万桶/日),叠加OPEC拒绝减产,油价从$110跌至$26。供给过剩主导。', 'implication': '供给过剩格局持续时间长(18个月以上),下游企业成本优势可持续;上游企业需要重组债务、缩减资本开支。', 'hedge_advice': '重点关注远期曲线结构(contango加深),利用期货锁价窗口。下游企业延长采购合约期限。', }, '2020 COVID': { 'history': '2020年COVID-19导致全球需求暴减2000万桶/日,WTI期货历史性跌至负值。需求冲击+仓储危机双重打击。', 'implication': '极端需求冲击下,航空业客运量降90%+,物流链中断。但复苏速度可能超预期——V型反弹是历史常态。', 'hedge_advice': '短期:保持现金流弹性,避免过度套保。中期:关注OPEC+协调减产信号,逢低建立多头头寸。', }, '2022 俄乌冲突': { 'history': '2022年俄乌冲突导致俄罗斯原油出口受制裁,供给缺口+地缘溢价推升布伦特至$130+。地缘政治+供给双重冲击。', 'implication': '地缘驱动的价格飙升通常突然但短暂(3-6个月),随后制裁适应和替代供给逐步消化溢价。', 'hedge_advice': '事件驱动行情中,期权策略优于期货——买入看涨期权锁定上限成本,保留价格回落的收益空间。', }, '2023 OPEC减产': { 'history': '2023年OPEC+主动减产200万桶/日,托底油价在$70-90区间。供给管理型市场,价格波动率较低。', 'implication': 'OPEC减产格局下,价格区间可预测性较高,但下行风险来自减产执行率下滑和非OPEC增产。', 'hedge_advice': '低波动环境适合使用零成本领(collar)策略,锁定窄价格带。', }, '常态/低波动': { 'history': '油价处于常态波动区间,无明显单一因子主导。市场处于供需基本平衡状态。', 'implication': '常态下关注结构性变化信号——OPEC会议决策、美国钻井数趋势、中国PMI走向。', 'hedge_advice': '常态下对冲比例可适当降低(25-40%),使用低成本期货锁价即可。', }, } # ═══════════════════════════════════════════════════════════ # NLG REPORT GENERATION (ENHANCED) # ═══════════════════════════════════════════════════════════ def generate_nlg_report(row): """生成单月深度风险研判报告,含经济学叙事和对冲建议。""" date = pd.Timestamp(row['test_date']).strftime('%Y年%m月') rl = row['risk_level'] rb = row['risk_bias'] top = row.get('top_factor', 'Unknown') q10 = row['pred_q10_1m'] * 100 q50 = row['pred_q50_1m'] * 100 q90 = row['pred_q90_1m'] * 100 vol = row['pred_vol'] * 100 rl_zh = {'Low': '低', 'Medium': '中等', 'High': '高'}.get(rl, rl) rb_zh = {'Upward': '偏上行', 'Downward': '偏下行', 'Balanced': '均衡'}.get(rb, rb) factor_zh = { 'Price': '价格联动', 'Supply': '供给端', 'Demand': '需求端', 'Risk_Geo': '地缘政治/风险', 'Technical': '技术面', }.get(top, top) # ── Part 1: 核心判断 ── summary = ( f"【{date}油价风险研判】\n" f"■ 核心判断:风险等级{rl_zh},方向{rb_zh},由{factor_zh}因子主导。\n" f"■ 1M预测区间:[{q10:+.1f}%, {q90:+.1f}%],中枢{q50:+.1f}%,波动率{vol:.1f}%。\n" ) # 3M if pd.notna(row.get('pred_q10_3m')): summary += f"■ 3M预测区间:[{row['pred_q10_3m']*100:+.1f}%, {row['pred_q90_3m']*100:+.1f}%],中枢{row['pred_q50_3m']*100:+.1f}%。\n" # CQR cqr_lo = row.get('cqr_q10_1m', None) if cqr_lo is not None and pd.notna(cqr_lo): summary += f"■ CQR校准区间:[{cqr_lo*100:+.1f}%, {row['cqr_q90_1m']*100:+.1f}%](分布自由覆盖保证)。\n" # ── Part 2: Regime 经济学叙事 ── regime = row.get('regime_match', '') regime_sim = row.get('regime_similarity', 0) if regime and regime != 'Unknown': summary += f"\n▶ 格局识别:当前最接近「{regime}」(相似度{regime_sim*100:.0f}%)\n" narr = REGIME_NARRATIVES.get(regime, {}) if narr: summary += f" 历史参照:{narr['history']}\n" summary += f" 当前启示:{narr['implication']}\n" summary += f" 对冲建议:{narr['hedge_advice']}\n" # ── Part 3: 行业影响 ── high_risk = [] med_risk = [] for ind in INDUSTRIES: risk = str(row.get(f'{ind}_risk', 'Low')) if 'High' in risk: high_risk.append(INDUSTRY_ZH.get(ind, ind)) elif 'Medium' in risk: med_risk.append(INDUSTRY_ZH.get(ind, ind)) if high_risk: summary += f"\n▶ 高风险行业:{'、'.join(high_risk)}——建议提升套保覆盖率至60-80%。\n" if med_risk: summary += f"▶ 中风险行业:{'、'.join(med_risk)}——建议维持25-50%套保覆盖。\n" if not high_risk and not med_risk: summary += f"\n▶ 各行业风险均处于可控水平,建议维持常规套保比例。\n" # ── Part 4: 压力测试 ── base = row.get('scenario_base', 0) * 100 vix = row.get('scenario_vix_shock', 0) * 100 supply = row.get('scenario_supply_cut', 0) * 100 demand = row.get('scenario_demand_crash', 0) * 100 worst = min(supply, demand) summary += ( f"\n▶ 压力测试:基准{base:+.1f}% | VIX翻倍{vix:+.1f}% | " f"供给中断{supply:+.1f}% | 需求崩塌{demand:+.1f}%\n" f" 最大下行风险:{worst:+.1f}%,建议预留相应流动性缓冲。\n" ) return summary def generate_all_reports(results): """为所有月份生成 NLG 报告。""" reports = {} for _, row in results.iterrows(): dt = pd.Timestamp(row['test_date']).strftime('%Y-%m') reports[dt] = generate_nlg_report(row) return reports # ═══════════════════════════════════════════════════════════ # EVALUATION METRICS # ═══════════════════════════════════════════════════════════ def evaluate_results(results): """计算全面的评估指标并打印。""" # 只评估有实际值的月份(排除 live forecast) eval_mask = results['actual_ret_1m'].notna() results = results[eval_mask].copy() ar = results['actual_ret_1m'].values av = results['actual_vol'].values n = len(results) print(f"\n{'='*65}") print("V2 CHAMPIONSHIP — EVALUATION") print("=" * 65) # 1M Interval print(f"\n--- 1M INTERVAL ---") models = [('QR (vol-adapt)', 'pred'), ('LightGBM', 'lgb')] if 'cqr_q10_1m' in results.columns: models.append(('Conformal QR', 'cqr')) for model_name, prefix in models: q10 = results[f'{prefix}_q10_1m'].values q90 = results[f'{prefix}_q90_1m'].values cov = ((ar >= q10) & (ar <= q90)).mean() wis = ((q90-q10) + (2/0.2)*np.maximum(q10-ar, 0) + (2/0.2)*np.maximum(ar-q90, 0)).mean() vm = np.median(av) hi = av > vm cov_hi = ((ar[hi] >= q10[hi]) & (ar[hi] <= q90[hi])).mean() if hi.sum() > 0 else 0 print(f" {model_name:<18} Cov={cov:.1%} HiCov={cov_hi:.1%} WIS={wis:.4f}") nq10 = np.full(n, np.quantile(ar, 0.10)) nq90 = np.full(n, np.quantile(ar, 0.90)) naive_wis = ((nq90-nq10) + (2/0.2)*np.maximum(nq10-ar, 0) + (2/0.2)*np.maximum(ar-nq90, 0)).mean() print(f" {'Naive':<18} WIS={naive_wis:.4f}") # 3M print(f"\n--- 3M INTERVAL ---") m3 = results['actual_ret_3m'].notna() if m3.sum() > 10: ar3 = results.loc[m3, 'actual_ret_3m'].values q10_3 = results.loc[m3, 'pred_q10_3m'].values q90_3 = results.loc[m3, 'pred_q90_3m'].values cov3 = ((ar3 >= q10_3) & (ar3 <= q90_3)).mean() wis3 = ((q90_3-q10_3) + (2/0.2)*np.maximum(q10_3-ar3, 0) + (2/0.2)*np.maximum(ar3-q90_3, 0)).mean() print(f" QR 3M (vol-adapt) Cov={cov3:.1%} WIS={wis3:.4f} n={m3.sum()}") # Vol print(f"\n--- VOL SCORE ---") pv = results['pred_vol'].values bl = results['baseline_ewma'].values for nm, prd in [('V2 BL+Resid', pv), ('EWMA', bl)]: from sklearn.metrics import mean_squared_error rmse = np.sqrt(mean_squared_error(av, prd)) corr = np.corrcoef(av, prd)[0, 1] print(f" {nm:<18} RMSE={rmse:.4f} corr={corr:+.3f}") # Risk levels print(f"\n--- RISK LEVELS ---") for lvl in ['Low', 'Medium', 'High']: mask = results['risk_level'] == lvl if mask.sum() > 0: print(f" {lvl:<8}: vol={av[mask].mean():.4f} n={mask.sum()}") # Factor frequency print(f"\n--- FACTOR FREQ ---") if 'top_factor' in results.columns: for fac, cnt in results['top_factor'].value_counts().items(): print(f" {fac:<12}: {cnt} ({cnt/n:.1%})") # SHAP print(f"\n--- SHAP (avg) ---") for g in FACTOR_GROUPS: col = f'shap_{g}' if col in results.columns: print(f" {g:<12}: {results[col].abs().mean():.4f}") # Scenario print(f"\n--- SCENARIO (latest) ---") lat = results.iloc[-1] print(f" Base: {lat['scenario_base']*100:+.1f}%") print(f" VIX shock: {lat['scenario_vix_shock']*100:+.1f}%") print(f" Supply cut: {lat['scenario_supply_cut']*100:+.1f}%") print(f" Demand crash: {lat['scenario_demand_crash']*100:+.1f}%") # NLG sample print(f"\n--- NLG REPORT (latest) ---") print(generate_nlg_report(results.iloc[-1])) # ═══════════════════════════════════════════════════════════ # ABLATION EXPERIMENTS # ═══════════════════════════════════════════════════════════ def _run_qr_eval(panel, feat_list, train_window=120): """内部辅助:对给定特征子集跑 walk-forward QR 并返回 (cov, wis, n)。""" hits, totals, wis_list = 0, 0, [] for i in range(train_window, len(panel) - 1): train_df = panel.iloc[max(0, i - train_window):i] test_df = panel.iloc[i:i + 1] avail = [f for f in feat_list if f in train_df.columns and train_df[f].notna().mean() > 0.8] if len(avail) < 2: continue X_tr = train_df[avail].fillna(train_df[avail].median()) X_te = test_df[avail].fillna(train_df[avail].median()) sc = StandardScaler() X_tr_s = sc.fit_transform(X_tr) X_te_s = sc.transform(X_te) y = train_df['target_ret_1m'].dropna() mask = y.index.isin(X_tr.index) y = y[mask] X_tr_s = X_tr_s[:len(y)] actual = panel['target_ret_1m'].iloc[i] if np.isnan(actual): continue try: qr10 = QuantileRegressor(quantile=0.10, alpha=0.1, solver='highs') qr90 = QuantileRegressor(quantile=0.90, alpha=0.1, solver='highs') qr10.fit(X_tr_s, y) qr90.fit(X_tr_s, y) p10, p90 = qr10.predict(X_te_s)[0], qr90.predict(X_te_s)[0] if p10 > p90: p10, p90 = p90, p10 hit = 1 if p10 <= actual <= p90 else 0 hits += hit totals += 1 wis_list.append((p90-p10) + (2/0.2)*max(p10-actual, 0) + (2/0.2)*max(actual-p90, 0)) except: continue cov = hits / totals if totals > 0 else 0 wis = np.mean(wis_list) if wis_list else 999 return cov, wis, totals def run_ablation(panel, features): """运行消融实验:训练窗口 + 因子组 leave-one-out。""" print(f"\n--- ABLATION EXPERIMENTS ---") ablation_results = [] # ── Part 1: Window ablation ── print(" [窗口消融]") for w in [84, 120, 180]: cov, wis, n_ab = _run_qr_eval(panel, features, train_window=w) ablation_results.append({ 'type': 'window', 'param': w, 'param_label': f'{w}月', 'cov': round(cov, 3), 'wis': round(wis, 4), 'n': n_ab }) print(f" Window={w:>3}: Cov={cov:.1%} WIS={wis:.4f} n={n_ab}") # ── Part 2: Factor-group leave-one-out ── print(" [因子组消融 — Leave-One-Out]") # Baseline: all factors base_cov, base_wis, base_n = _run_qr_eval(panel, features) ablation_results.append({ 'type': 'factor_group', 'param': 'ALL', 'param_label': '全部因子', 'cov': round(base_cov, 3), 'wis': round(base_wis, 4), 'n': base_n }) print(f" ALL (baseline): Cov={base_cov:.1%} WIS={base_wis:.4f}") group_zh = {'Price': '价格联动', 'Supply': '供给端', 'Demand': '需求端', 'Risk_Geo': '地缘/风险', 'Technical': '技术面'} for group, members in FACTOR_GROUPS.items(): # Remove this group's features reduced = [f for f in features if f not in members] if len(reduced) < 2: continue cov, wis, n_ab = _run_qr_eval(panel, reduced) delta_cov = cov - base_cov delta_wis = wis - base_wis ablation_results.append({ 'type': 'factor_group', 'param': group, 'param_label': f'去除{group_zh.get(group, group)}', 'cov': round(cov, 3), 'wis': round(wis, 4), 'n': n_ab, 'delta_cov': round(delta_cov, 3), 'delta_wis': round(delta_wis, 4), }) print(f" -{group:<12}: Cov={cov:.1%} (Δ{delta_cov:+.1%}) " f"WIS={wis:.4f} (Δ{delta_wis:+.4f})") return ablation_results