oilverse-api / scripts /fetch_worldbank.py
孙家明
deploy: OilVerse for HuggingFace (Node.js 18 fix)
fab9847
"""
GitHub Actions 用: 从 World Bank 下载商品价格 Pink Sheet
v3: 不依赖日期解析,直接保存所有数据
"""
import pandas as pd
import os
OUTPUT = 'data/cloud'
os.makedirs(OUTPUT, exist_ok=True)
print("Fetching World Bank Pink Sheet...")
url = "https://thedocs.worldbank.org/en/doc/5d903e848db1d1b83e0ec8f744e55570-0350012021/related/CMO-Historical-Data-Monthly.xlsx"
try:
xls = pd.ExcelFile(url, engine='openpyxl')
print(f" Sheets: {xls.sheet_names}")
# 读 Monthly Prices,跳过标题行
df = pd.read_excel(xls, sheet_name='Monthly Prices', skiprows=4, index_col=0)
print(f" Raw: {df.shape}")
print(f" Index samples: {list(df.index[:3])}, type: {type(df.index[0])}")
# 日期在 index 里,可能是 "1960M01" 格式
# 直接保留原始 index,不做 to_datetime
# 只保留有数值的行(去掉纯文字行)
first_col = df.iloc[:, 0]
numeric_mask = pd.to_numeric(first_col, errors='coerce').notna()
df = df[numeric_mask]
print(f" After numeric filter: {df.shape}")
# 转数值
df_out = df.apply(pd.to_numeric, errors='coerce')
df_out = df_out.dropna(how='all')
outpath = os.path.join(OUTPUT, 'worldbank_commodities.csv')
df_out.to_csv(outpath)
print(f"✓ Saved {len(df_out)} rows × {len(df_out.columns)} cols")
print(f" Last 3 rows index: {list(df_out.index[-3:])}")
print(f" Columns: {list(df_out.columns[:10])}...")
except Exception as e:
print(f"✗ Failed: {e}")
import traceback
traceback.print_exc()