""" GitHub Actions 用: 从 World Bank 下载商品价格 Pink Sheet v3: 不依赖日期解析,直接保存所有数据 """ import pandas as pd import os OUTPUT = 'data/cloud' os.makedirs(OUTPUT, exist_ok=True) print("Fetching World Bank Pink Sheet...") url = "https://thedocs.worldbank.org/en/doc/5d903e848db1d1b83e0ec8f744e55570-0350012021/related/CMO-Historical-Data-Monthly.xlsx" try: xls = pd.ExcelFile(url, engine='openpyxl') print(f" Sheets: {xls.sheet_names}") # 读 Monthly Prices,跳过标题行 df = pd.read_excel(xls, sheet_name='Monthly Prices', skiprows=4, index_col=0) print(f" Raw: {df.shape}") print(f" Index samples: {list(df.index[:3])}, type: {type(df.index[0])}") # 日期在 index 里,可能是 "1960M01" 格式 # 直接保留原始 index,不做 to_datetime # 只保留有数值的行(去掉纯文字行) first_col = df.iloc[:, 0] numeric_mask = pd.to_numeric(first_col, errors='coerce').notna() df = df[numeric_mask] print(f" After numeric filter: {df.shape}") # 转数值 df_out = df.apply(pd.to_numeric, errors='coerce') df_out = df_out.dropna(how='all') outpath = os.path.join(OUTPUT, 'worldbank_commodities.csv') df_out.to_csv(outpath) print(f"✓ Saved {len(df_out)} rows × {len(df_out.columns)} cols") print(f" Last 3 rows index: {list(df_out.index[-3:])}") print(f" Columns: {list(df_out.columns[:10])}...") except Exception as e: print(f"✗ Failed: {e}") import traceback traceback.print_exc()