"""
GitHub Actions 用: 从 World Bank 下载商品价格 Pink Sheet
v3: 不依赖日期解析，直接保存所有数据
"""
import pandas as pd
import os

# Script body: download the World Bank "Pink Sheet" monthly commodity price
# workbook, keep the numeric rows of its 'Monthly Prices' sheet, and write
# them to <OUTPUT>/worldbank_commodities.csv.
OUTPUT = 'data/cloud'
os.makedirs(OUTPUT, exist_ok=True)

print("Fetching World Bank Pink Sheet...")
url = "https://thedocs.worldbank.org/en/doc/5d903e848db1d1b83e0ec8f744e55570-0350012021/related/CMO-Historical-Data-Monthly.xlsx"

try:
    # Open the workbook in a context manager so its handle is released even
    # when parsing fails part-way through.
    with pd.ExcelFile(url, engine='openpyxl') as xls:
        print(f"  Sheets: {xls.sheet_names}")

        # Read 'Monthly Prices', skipping the title rows above the header.
        df = pd.read_excel(xls, sheet_name='Monthly Prices', skiprows=4, index_col=0)

    print(f"  Raw: {df.shape}")
    if df.empty:
        # Fail with a clear message instead of an IndexError from the
        # df.index[0] / df.iloc[:, 0] accesses below.
        raise ValueError("'Monthly Prices' sheet parsed to an empty table")
    print(f"  Index samples: {list(df.index[:3])}, type: {type(df.index[0])}")

    # The dates live in the index, possibly in "1960M01" form.
    # The raw index is kept as-is; no to_datetime conversion is attempted.
    # A row is kept only when its first data column parses as a number,
    # which drops the pure-text rows left under the header.
    first_col = df.iloc[:, 0]
    numeric_mask = pd.to_numeric(first_col, errors='coerce').notna()
    df = df[numeric_mask]
    print(f"  After numeric filter: {df.shape}")

    # Coerce every column to numeric; cells that cannot be parsed become NaN.
    df_out = df.apply(pd.to_numeric, errors='coerce')
    df_out = df_out.dropna(how='all')
    if df_out.empty:
        # Do not replace a previously saved CSV with a header-only file when
        # the filter found nothing (e.g. the sheet layout changed).
        raise ValueError(
            "no numeric rows found in 'Monthly Prices'; "
            "refusing to overwrite the existing CSV"
        )

    outpath = os.path.join(OUTPUT, 'worldbank_commodities.csv')
    df_out.to_csv(outpath)
    print(f"✓ Saved {len(df_out)} rows × {len(df_out.columns)} cols")
    print(f"  Last 3 rows index: {list(df_out.index[-3:])}")
    print(f"  Columns: {list(df_out.columns[:10])}...")

except Exception as e:
    # Top-level boundary: report the failure with a full traceback.
    # NOTE(review): the process still exits with status 0 after a failure,
    # so the calling workflow step will not be marked as failed — confirm
    # that this best-effort behaviour is intended.
    print(f"✗ Failed: {e}")
    import traceback
    traceback.print_exc()