Spaces:
Configuration error
Configuration error
| """ | |
| 数据采集与处理层 | |
| Market Data Structure Layer (FIN 510, FIN 551) | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| import yfinance as yf | |
| from datetime import datetime, timedelta | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import logging | |
| from typing import Dict, List, Optional, Tuple | |
class HKStockDataFetcher:
    """Fetch and clean Hong Kong stock price data through yfinance."""

    def __init__(self, cache_dir="./cache"):
        """Remember the cache directory and create a logger.

        Args:
            cache_dir: Path stored on the instance. No method of this class
                reads from or writes to it.
        """
        self.cache_dir = cache_dir
        self.logger = logging.getLogger(__name__)

    def fetch_hk_stock_data(self, symbol: str, period: str = "60d", interval: str = "15m") -> pd.DataFrame:
        """Download price history for one symbol and clean it.

        Args:
            symbol: Ticker symbol, e.g. '0700.HK' (Tencent).
            period: Look-back period passed to yfinance
                (1d, 5d, 1mo, 3mo, 6mo, 1y, 2y, 5y, 10y, ytd, max).
            interval: Bar size passed to yfinance
                (1m, 2m, 5m, 15m, 30m, 60m, 90m, 1h, 1d, 5d, 1wk, 1mo, 3mo).

        Returns:
            The cleaned history DataFrame (see ``_clean_price_data``).

        Raises:
            ValueError: If yfinance returns an empty frame for ``symbol``.
            Exception: Any other error is logged and re-raised unchanged.
        """
        try:
            stock = yf.Ticker(symbol)
            df = stock.history(period=period, interval=interval)
            if df.empty:
                raise ValueError(f"No data found for symbol {symbol}")
            df = self._clean_price_data(df)
            self.logger.info("Successfully fetched %d records for %s", len(df), symbol)
            return df
        except Exception as e:
            self.logger.error("Error fetching data for %s: %s", symbol, e)
            raise

    def fetch_multiple_stocks(self, symbols: List[str], period: str = "60d", interval: str = "15m") -> Dict[str, pd.DataFrame]:
        """Fetch several symbols, skipping the ones that fail.

        Args:
            symbols: Ticker symbols to download.
            period: Look-back period forwarded to ``fetch_hk_stock_data``.
            interval: Bar size forwarded to ``fetch_hk_stock_data``.

        Returns:
            Mapping of symbol to its cleaned DataFrame. Symbols whose fetch
            raised are logged as warnings and left out of the mapping.
        """
        stock_data = {}
        for symbol in symbols:
            try:
                stock_data[symbol] = self.fetch_hk_stock_data(symbol, period, interval)
            except Exception as e:
                # Best effort: one bad symbol must not abort the whole batch.
                self.logger.warning("Failed to fetch data for %s: %s", symbol, e)
        return stock_data

    def _clean_price_data(self, df: pd.DataFrame, max_change: float = 0.5) -> pd.DataFrame:
        """Drop incomplete rows, price spikes and non-positive-volume bars.

        Args:
            df: Raw price frame; OHLC and Volume columns are used when present.
            max_change: Rows whose absolute bar-to-bar change in any OHLC
                column is greater than or equal to this fraction are removed.

        Returns:
            The filtered frame.
        """
        df = df.dropna()

        for col in ('Open', 'High', 'Low', 'Close'):
            if col not in df.columns:
                continue
            change = df[col].pct_change().abs()
            # The first row has no predecessor, so its change is NaN. A plain
            # `change < max_change` would evaluate False there and silently
            # discard one valid leading row per column; keep undefined changes.
            df = df[change.isna() | (change < max_change)]

        # Keep only bars that actually traded.
        if 'Volume' in df.columns:
            df = df[df['Volume'] > 0]
        return df

    def get_stock_info(self, symbol: str) -> Dict:
        """Return basic descriptive fields for a symbol.

        Args:
            symbol: Ticker symbol to look up.

        Returns:
            Dict with 'symbol', 'longName', 'sector', 'industry',
            'marketCap', 'currency' and 'exchange'. If the lookup raises,
            the error is logged and only ``{'symbol': symbol}`` is returned.
        """
        try:
            info = yf.Ticker(symbol).info
            return {
                'symbol': symbol,
                'longName': info.get('longName', ''),
                'sector': info.get('sector', ''),
                'industry': info.get('industry', ''),
                'marketCap': info.get('marketCap', 0),
                'currency': info.get('currency', 'HKD'),
                'exchange': info.get('exchange', 'HKG'),
            }
        except Exception as e:
            self.logger.error("Error fetching info for %s: %s", symbol, e)
            return {'symbol': symbol}
class NewsDataFetcher:
    """Collect news headlines for a stock from Google News and yfinance."""

    def __init__(self):
        """Create a logger and the browser-like headers used for HTTP requests."""
        self.logger = logging.getLogger(__name__)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_financial_news(self, stock_symbol: str, max_headlines: int = 10) -> List[str]:
        """Return up to ``max_headlines`` de-duplicated headlines for a stock.

        Google News is asked for half of the quota; yfinance is then asked
        for whatever is still missing, so an odd quota or an empty Google
        result no longer caps the output below ``max_headlines``.

        Args:
            stock_symbol: Ticker symbol, e.g. '0700.HK'.
            max_headlines: Maximum number of headlines to return.

        Returns:
            Headlines in first-seen order. May be empty when neither source
            yields anything. If an unexpected error escapes, a single
            placeholder string is returned instead.
        """
        if max_headlines <= 0:
            return []

        headlines = []
        try:
            company_name = self._get_company_name(stock_symbol)
            query = f"{company_name} {stock_symbol.replace('.HK', '')} 港股"

            headlines.extend(self._search_google_news(query, max_headlines // 2))
            remaining = max_headlines - len(headlines)
            headlines.extend(self._search_yahoo_news(stock_symbol, remaining))

            # dict.fromkeys de-duplicates while preserving insertion order.
            unique_headlines = list(dict.fromkeys(headlines))[:max_headlines]
            self.logger.info("Fetched %d headlines for %s", len(unique_headlines), stock_symbol)
            return unique_headlines
        except Exception as e:
            self.logger.error("Error fetching news for %s: %s", stock_symbol, e)
            return [f"No recent news available for {stock_symbol}"]

    def _get_company_name(self, stock_symbol: str) -> str:
        """Map a known symbol to its Chinese company name.

        Unknown symbols fall back to the symbol with '.HK' removed.
        """
        company_names = {
            '0700.HK': '腾讯',
            '0941.HK': '中国移动',
            '1299.HK': '友邦保险',
            '2318.HK': '中国平安',
            '0005.HK': '汇丰控股',
            '0175.HK': '吉利汽车',
            '3690.HK': '美团',
            '9988.HK': '阿里巴巴',
            '1810.HK': '小米集团',
            '2020.HK': '安踏体育'
        }
        return company_names.get(stock_symbol, stock_symbol.replace('.HK', ''))

    def _search_google_news(self, query: str, max_results: int) -> List[str]:
        """Scrape headline texts from a Google News search page.

        Args:
            query: Free-text search query.
            max_results: Maximum number of headlines to return.

        Returns:
            Headlines longer than 10 characters. Any failure (network, HTTP
            error status, parsing) is logged as a warning and yields the
            headlines collected so far.
        """
        headlines = []
        if max_results <= 0:
            return headlines
        try:
            # Pass the query through `params` so requests URL-encodes it;
            # interpolating it into the URL breaks on '&', '#' and similar.
            response = requests.get(
                "https://news.google.com/search",
                params={'q': query, 'hl': 'zh-CN'},
                headers=self.headers,
                timeout=10,
            )
            # Do not parse error pages as if they were results.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # NOTE(review): this CSS class is tied to Google's page markup;
            # confirm it still matches, otherwise this source returns nothing.
            for element in soup.find_all('h3', class_='ipQwMb'):
                headline = element.get_text().strip()
                if headline and len(headline) > 10:  # skip very short titles
                    headlines.append(headline)
                    if len(headlines) >= max_results:
                        break
        except Exception as e:
            self.logger.warning("Google news search failed: %s", e)
        return headlines

    @staticmethod
    def _extract_title(item) -> str:
        """Pull the headline text out of one yfinance news entry.

        Handles a top-level 'title' key as well as a title nested under
        'content'. NOTE(review): the nested layout is what newer yfinance
        releases appear to return - confirm against the installed version.
        """
        if not isinstance(item, dict):
            return ''
        title = item.get('title')
        if not title:
            content = item.get('content')
            if isinstance(content, dict):
                title = content.get('title')
        return title.strip() if isinstance(title, str) else ''

    def _search_yahoo_news(self, stock_symbol: str, max_results: int) -> List[str]:
        """Collect headline titles from ``yfinance.Ticker(...).news``.

        Args:
            stock_symbol: Ticker symbol.
            max_results: Maximum number of headlines to return.

        Returns:
            Titles longer than 10 characters. Failures are logged as a
            warning and yield the titles collected so far.
        """
        headlines = []
        if max_results <= 0:
            return headlines
        try:
            news = yf.Ticker(stock_symbol).news or []
            for item in news:
                title = self._extract_title(item)
                if len(title) > 10:
                    headlines.append(title)
                    if len(headlines) >= max_results:
                        break
        except Exception as e:
            self.logger.warning("Yahoo news search failed: %s", e)
        return headlines
class MarketDataProcessor:
    """Derive return, volume, volatility, time and microstructure features."""

    # HKEX continuous trading opens at 09:30, expressed in minutes after midnight.
    _SESSION_OPEN_MINUTE = 9 * 60 + 30

    def __init__(self):
        """Create a logger for the processor."""
        self.logger = logging.getLogger(__name__)

    def process_intraday_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add technical indicator and time columns to a price frame.

        Args:
            df: Price data with Open, High, Low, Close and Volume columns and
                a datetime index. The input is not modified.

        Returns:
            A copy with the extra columns. Rows containing any NaN are
            dropped, which removes the warm-up rows of the rolling windows.
        """
        processed_df = df.copy()

        # Return and range measures.
        processed_df['Returns'] = processed_df['Close'].pct_change()
        processed_df['Log_Returns'] = np.log(processed_df['Close'] / processed_df['Close'].shift(1))
        processed_df['HL_Pct'] = (processed_df['High'] - processed_df['Low']) / processed_df['Close']
        processed_df['OC_Pct'] = (processed_df['Close'] - processed_df['Open']) / processed_df['Open']

        # Volume measures.
        processed_df['Volume_MA_5'] = processed_df['Volume'].rolling(window=5).mean()
        processed_df['Volume_Ratio'] = processed_df['Volume'] / processed_df['Volume_MA_5']
        processed_df['VWAP'] = self._calculate_vwap(processed_df)

        # Volatility measures.
        processed_df['Volatility_5'] = processed_df['Returns'].rolling(window=5).std()
        processed_df['Volatility_20'] = processed_df['Returns'].rolling(window=20).std()
        processed_df['ATR'] = self._calculate_atr(processed_df)

        processed_df = self._add_time_features(processed_df)
        return processed_df.dropna()

    def _calculate_vwap(self, df: pd.DataFrame) -> pd.Series:
        """Return the cumulative volume-weighted average of the typical price.

        The sums run over the whole frame from its first row; they are not
        reset at the start of each trading day.
        """
        typical_price = (df['High'] + df['Low'] + df['Close']) / 3
        return (typical_price * df['Volume']).cumsum() / df['Volume'].cumsum()

    def _calculate_atr(self, df: pd.DataFrame, period: int = 14) -> pd.Series:
        """Return the Average True Range as a simple rolling mean.

        Args:
            df: Frame with High, Low and Close columns.
            period: Rolling window length.
        """
        prev_close = df['Close'].shift()
        candidates = pd.concat(
            [
                df['High'] - df['Low'],
                (df['High'] - prev_close).abs(),
                (df['Low'] - prev_close).abs(),
            ],
            axis=1,
        )
        # Row-wise max skips NaN, so the first bar (no previous close) falls
        # back to its high-low range instead of becoming NaN.
        true_range = candidates.max(axis=1)
        return true_range.rolling(window=period).mean()

    def _add_time_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add Hour, Minute, DayOfWeek and Market_Session columns.

        Sessions are derived from the index clock time: before 09:30 is
        'Pre_Market', 12:00-12:59 is 'Lunch_Break', 16:00 onwards is
        'After_Market', everything else is 'Regular'.
        NOTE(review): this presumes the index is in Hong Kong local time -
        confirm against the data source.

        Args:
            df: Frame with a datetime index. The input is not modified.

        Returns:
            A copy of ``df`` with the four columns added.
        """
        df = df.copy()
        df['Hour'] = df.index.hour
        df['Minute'] = df.index.minute
        df['DayOfWeek'] = df.index.dayofweek

        minute_of_day = df['Hour'] * 60 + df['Minute']
        df['Market_Session'] = 'Regular'
        # Continuous trading starts at 09:30; testing `Hour < 10` mislabelled
        # the 09:30-09:59 bars as pre-market.
        df.loc[minute_of_day < self._SESSION_OPEN_MINUTE, 'Market_Session'] = 'Pre_Market'
        df.loc[df['Hour'] >= 16, 'Market_Session'] = 'After_Market'
        df.loc[df['Hour'] == 12, 'Market_Session'] = 'Lunch_Break'
        return df

    def calculate_microstructure_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute OHLCV-based microstructure proxy features.

        Args:
            df: Frame with Open, High, Low, Close, Volume, Returns and
                Volatility_20 columns, e.g. the output of
                ``process_intraday_data``.

        Returns:
            Frame on the same index with Price_Impact, Order_Imbalance,
            Bid_Ask_Spread, Market_Impact and Info_Ratio. Rows with any
            undefined or infinite value are dropped.
        """
        features = pd.DataFrame(index=df.index)
        abs_returns = df['Returns'].abs()

        # Price impact: absolute return per unit of log volume.
        features['Price_Impact'] = abs_returns / np.log1p(df['Volume'])
        # Order-flow imbalance proxy: where the close sits within the bar.
        features['Order_Imbalance'] = (df['Close'] - df['Open']) / (df['High'] - df['Low'] + 1e-8)
        # Liquidity proxies; the spread here is the bar's high-low range.
        features['Bid_Ask_Spread'] = (df['High'] - df['Low']) / df['Close']
        features['Market_Impact'] = abs_returns / (df['Volume'] + 1e-8)
        # Information-flow proxy: return size relative to recent volatility.
        features['Info_Ratio'] = abs_returns / df['Volatility_20']

        # Zero volatility or zero volume makes the ratios +/-inf, which
        # dropna() alone would keep; turn them into NaN first.
        return features.replace([np.inf, -np.inf], np.nan).dropna()
# Usage example / manual smoke test
def main() -> None:
    """Fetch, process and print data for one symbol as a manual smoke test.

    Performs live network calls through the fetcher classes; any exception
    raised by the price download propagates to the caller.
    """
    logging.basicConfig(level=logging.INFO)

    data_fetcher = HKStockDataFetcher()
    news_fetcher = NewsDataFetcher()
    processor = MarketDataProcessor()

    symbol = '0700.HK'
    print(f"Testing data fetching for {symbol}...")

    # Raw price data.
    price_data = data_fetcher.fetch_hk_stock_data(symbol, period="30d", interval="15m")
    print(f"Price data shape: {price_data.shape}")
    print(price_data.head())

    # Indicator and time features.
    processed_data = processor.process_intraday_data(price_data)
    print(f"Processed data shape: {processed_data.shape}")
    print(processed_data.columns.tolist())

    # News headlines.
    news = news_fetcher.fetch_financial_news(symbol, max_headlines=5)
    print(f"News headlines: {news}")

    # Microstructure features.
    micro_features = processor.calculate_microstructure_features(processed_data)
    print(f"Microstructure features shape: {micro_features.shape}")
    print(micro_features.head())


if __name__ == "__main__":
    # Keep the demo inside main() so it creates no module-level globals.
    main()