"""
Market Events Scraper - Earnings, Economic Indicators & Central Bank Events

Aggregates upcoming and recent market-moving events.
Web scraping approach - no API keys required.
"""

import calendar
import hashlib
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import feedparser
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class MarketEventsScraper:
    """
    Scrapes market events from multiple sources.

    Focus: earnings, economic indicators, central bank announcements.
    Every event is a plain dict with the same keys ('id', 'title',
    'summary', 'source', 'category', 'timestamp', 'event_date', 'url',
    'event_type', 'ticker', 'expected_value', 'actual_value',
    'previous_value', 'impact', 'sentiment', 'is_breaking',
    'source_weight', 'likes', 'retweets'). All datetimes are naive and
    expressed in local time.
    """

    # Central bank RSS feeds (already in use for news)
    CENTRAL_BANKS = {
        'fed': {
            'name': 'Federal Reserve',
            'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
            'weight': 2.0,
        },
        'ecb': {
            'name': 'European Central Bank',
            'rss': 'https://www.ecb.europa.eu/rss/press.xml',
            'weight': 2.0,
        },
    }

    REQUEST_TIMEOUT = 10      # seconds, per HTTP request
    FETCH_TIMEOUT = 35        # seconds to wait for each source's worker
    MAX_EARNINGS_ROWS = 20    # data rows read from the earnings table
    MAX_FEED_ENTRIES = 10     # newest entries read from each RSS feed
    MAX_EVENT_AGE_DAYS = 7    # central bank entries older than this are skipped
    SUMMARY_MAX_CHARS = 200   # summaries longer than this are truncated

    def __init__(self):
        """Initialize scraper with a browser-like HTTP session."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })

    def scrape_market_events(self, max_items: int = 50, days_ahead: int = 14) -> List[Dict]:
        """
        Scrape market events from all sources.

        Args:
            max_items: Maximum number of events to return.
            days_ahead: Events scheduled more than this many days in the
                future are dropped. Past (recent) events are kept.

        Returns:
            Unified list sorted by event date, with high-impact events first
            among events sharing the same date. Falls back to mock data when
            no source yields anything.
        """
        fetchers = (
            (self._fetch_earnings, 'earnings'),
            (self._fetch_economic_indicators, 'indicators'),
            (self._fetch_central_bank_events, 'central_banks'),
        )
        all_events = []
        seen_keys = set()

        # Parallel fetching
        with ThreadPoolExecutor(max_workers=len(fetchers)) as executor:
            futures = [
                (executor.submit(fetch), source_type)
                for fetch, source_type in fetchers
            ]
            for future, source_type in futures:
                try:
                    events = future.result(timeout=self.FETCH_TIMEOUT)
                except Exception as e:
                    # Best-effort aggregation: one failing source must not
                    # prevent the others from being reported.
                    logger.error("Error fetching %s: %s", source_type, e)
                    continue

                # Deduplicate by URL; events without a URL fall back to
                # their id so they are not all collapsed into one.
                for event in events:
                    key = event.get('url') or f"id:{event.get('id')}"
                    if key not in seen_keys:
                        seen_keys.add(key)
                        all_events.append(event)
                logger.info("Fetched %d events from %s", len(events), source_type)

        # If no events fetched, use mock data
        if not all_events:
            logger.warning("No market events fetched - using mock data")
            return self._get_mock_events()[:max_items]

        horizon = datetime.now() + timedelta(days=days_ahead)
        upcoming = [
            event for event in all_events
            if self._event_date(event) <= horizon
        ]

        # Sort by event date, then high impact before everything else
        upcoming.sort(
            key=lambda event: (self._event_date(event), event.get('impact') != 'high'),
        )
        return upcoming[:max_items]

    @staticmethod
    def _event_date(event: Dict) -> datetime:
        """Return the event's scheduled date, or its timestamp if none is set."""
        return event.get('event_date', event['timestamp'])

    @staticmethod
    def _stable_id(key: str) -> int:
        """
        Derive a deterministic positive integer id from ``key``.

        The built-in ``hash()`` of a string is salted per process, so it
        cannot be used for ids that must match across runs.
        """
        digest = hashlib.sha256(key.encode('utf-8')).hexdigest()
        return int(digest[:15], 16)  # 60 bits: fits a signed 64-bit integer

    def _fetch_earnings(self) -> List[Dict]:
        """
        Fetch the earnings calendar from Yahoo Finance (web scraping).

        Returns mock earnings when the request fails, the earnings table
        cannot be located, or no row can be parsed.
        """
        try:
            url = 'https://finance.yahoo.com/calendar/earnings'
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Yahoo Finance uses a table for earnings
            table = soup.find('table', {'class': re.compile('earnings')})
            if not table:
                logger.warning("Could not find earnings table on Yahoo Finance")
                return self._get_mock_earnings()

            events = []
            # Skip the header row, then take at most MAX_EARNINGS_ROWS rows
            for row in table.find_all('tr')[1:self.MAX_EARNINGS_ROWS + 1]:
                try:
                    event = self._parse_earnings_row(row.find_all('td'))
                except Exception as e:
                    logger.debug("Error parsing earnings row: %s", e)
                    continue
                if event is not None:
                    events.append(event)

            return events if events else self._get_mock_earnings()

        except Exception as e:
            # Scraping is best-effort; any failure degrades to mock data.
            logger.error("Error fetching earnings: %s", e)
            return self._get_mock_earnings()

    def _parse_earnings_row(self, cells) -> Optional[Dict]:
        """
        Build an earnings event from the ``<td>`` cells of one table row.

        Expected column order: ticker, company, EPS estimate, reported EPS
        and (optionally) report time. Returns None for rows with fewer than
        four cells or an empty ticker.
        """
        if len(cells) < 4:
            return None

        ticker = cells[0].get_text(strip=True)
        if not ticker:
            return None
        company = cells[1].get_text(strip=True)
        eps_estimate = cells[2].get_text(strip=True)
        reported_eps = cells[3].get_text(strip=True)
        event_time = cells[4].get_text(strip=True) if len(cells) > 4 else 'N/A'

        event_date = self._parse_earnings_date(event_time)
        summary = f"Expected EPS: {eps_estimate}"
        if reported_eps and reported_eps != 'N/A':
            summary += f", Reported: {reported_eps}"

        return {
            'id': self._stable_id(f"earnings_{ticker}_{event_date}"),
            'title': f"{company} ({ticker}) Earnings Report",
            'summary': summary,
            'source': 'Yahoo Finance',
            'category': 'earnings',
            'timestamp': datetime.now(),
            'event_date': event_date,
            'url': f"https://finance.yahoo.com/quote/{ticker}",
            'event_type': 'earnings',
            'ticker': ticker,
            'expected_value': self._parse_float(eps_estimate),
            'actual_value': self._parse_float(reported_eps),
            'previous_value': None,
            'impact': 'medium',  # Earnings are generally medium impact
            'sentiment': self._determine_earnings_sentiment(eps_estimate, reported_eps),
            'is_breaking': False,
            'source_weight': 1.3,
            'likes': 0,
            'retweets': 0,
        }

    def _fetch_economic_indicators(self) -> List[Dict]:
        """
        Fetch economic indicators.

        Placeholder: currently returns mock data only. A real
        implementation would scrape a release calendar (e.g. FRED).
        """
        return self._get_mock_indicators()

    def _fetch_central_bank_events(self) -> List[Dict]:
        """
        Fetch central bank announcements from the RSS feeds in CENTRAL_BANKS.

        A feed that fails to download, or an entry that fails to parse, is
        logged and skipped; the remaining feeds and entries are still
        returned.
        """
        events = []
        now = datetime.now()

        for bank_id, bank_info in self.CENTRAL_BANKS.items():
            try:
                # Download with an explicit timeout and hand the bytes to
                # feedparser: feedparser.parse(url) performs its own request
                # with no timeout, which could stall the whole scrape. A
                # standalone request is used (rather than self.session)
                # because this method runs in a worker thread alongside
                # _fetch_earnings, which uses the shared session.
                response = requests.get(
                    bank_info['rss'],
                    headers=dict(self.session.headers),
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.raise_for_status()
                feed = feedparser.parse(response.content)
            except Exception as e:
                logger.error("Error fetching %s RSS: %s", bank_id, e)
                continue

            for entry in feed.entries[:self.MAX_FEED_ENTRIES]:
                try:
                    event = self._build_central_bank_event(entry, bank_id, bank_info, now)
                except Exception as e:
                    logger.debug("Error parsing %s entry: %s", bank_id, e)
                    continue
                if event is not None:
                    events.append(event)

        return events

    def _build_central_bank_event(self, entry, bank_id: str, bank_info: Dict,
                                  now: datetime) -> Optional[Dict]:
        """
        Convert one RSS entry into an event dict.

        Returns None when the entry is more than MAX_EVENT_AGE_DAYS days old.
        """
        published = getattr(entry, 'published_parsed', None)
        if published:
            # feedparser normalizes parsed dates to UTC; convert to naive
            # local time so the value is comparable with datetime.now().
            timestamp = datetime.fromtimestamp(calendar.timegm(published))
        else:
            timestamp = now

        age = now - timestamp
        # Skip old events
        if age.days > self.MAX_EVENT_AGE_DAYS:
            return None

        title = entry.get('title', '')
        summary = entry.get('summary', '') or title
        url = entry.get('link', '')

        # Clean HTML from summary
        if summary:
            summary = BeautifulSoup(summary, 'html.parser').get_text()
            if len(summary) > self.SUMMARY_MAX_CHARS:
                summary = summary[:self.SUMMARY_MAX_CHARS] + '...'

        return {
            # Entries without a link are identified by bank and title
            'id': self._stable_id(url or f"{bank_id}:{title}"),
            'title': f"{bank_info['name']}: {title}",
            'summary': summary,
            'source': bank_info['name'],
            'category': 'central_bank',
            'timestamp': timestamp,
            'event_date': timestamp,
            'url': url,
            'event_type': 'central_bank_announcement',
            'ticker': None,
            'expected_value': None,
            'actual_value': None,
            'previous_value': None,
            'impact': 'high',  # Central bank events are high impact
            'sentiment': 'neutral',
            'is_breaking': age.days < 1,
            'source_weight': bank_info['weight'],
            'likes': 0,
            'retweets': 0,
        }

    def _parse_earnings_date(self, time_str: str) -> datetime:
        """
        Map an earnings report time label to a datetime.

        "Before Market"/"BMO" maps to today 07:00, "After Market"/"AMC" to
        today 16:00; anything else defaults to tomorrow 07:00.
        """
        time_str = time_str or ''
        now = datetime.now()

        if 'Before Market' in time_str or 'BMO' in time_str:
            return now.replace(hour=7, minute=0, second=0, microsecond=0)
        if 'After Market' in time_str or 'AMC' in time_str:
            return now.replace(hour=16, minute=0, second=0, microsecond=0)
        # Default to tomorrow morning
        return (now + timedelta(days=1)).replace(hour=7, minute=0, second=0, microsecond=0)

    def _parse_float(self, value_str: str) -> Optional[float]:
        """
        Parse a float from a display string such as "$2.10".

        Returns None for empty values, the placeholders 'N/A' and '-', and
        anything that does not reduce to a valid number.
        """
        if not value_str or value_str in ('N/A', '-'):
            return None

        try:
            # Remove $ and other non-numeric characters except . and -
            cleaned = re.sub(r'[^\d.-]', '', value_str)
            return float(cleaned)
        except (TypeError, ValueError):
            return None

    def _determine_earnings_sentiment(self, expected: str, actual: Optional[str]) -> str:
        """
        Determine sentiment based on earnings beat/miss.

        Returns 'positive' when actual exceeds expected, 'negative' when it
        falls short, and 'neutral' when equal or when either value is
        missing or unparseable.
        """
        if not actual or actual == 'N/A':
            return 'neutral'

        exp_val = self._parse_float(expected)
        act_val = self._parse_float(actual)
        if exp_val is None or act_val is None:
            return 'neutral'

        if act_val > exp_val:
            return 'positive'  # Beat
        if act_val < exp_val:
            return 'negative'  # Miss
        return 'neutral'  # In-line

    def _get_mock_earnings(self) -> List[Dict]:
        """Mock earnings data"""
        now = datetime.now()
        return [
            {
                'id': 1,
                'title': 'Apple Inc. (AAPL) Earnings Report',
                'summary': 'Expected EPS: $2.10',
                'source': 'Yahoo Finance',
                'category': 'earnings',
                'timestamp': now,
                'event_date': now + timedelta(days=2, hours=16),
                'url': 'https://finance.yahoo.com/quote/AAPL',
                'event_type': 'earnings',
                'ticker': 'AAPL',
                'expected_value': 2.10,
                'actual_value': None,
                'previous_value': 1.95,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.5,
                'likes': 0,
                'retweets': 0,
            },
            {
                'id': 2,
                'title': 'Microsoft Corporation (MSFT) Earnings Report',
                'summary': 'Expected EPS: $2.75',
                'source': 'Yahoo Finance',
                'category': 'earnings',
                'timestamp': now,
                'event_date': now + timedelta(days=3, hours=16),
                'url': 'https://finance.yahoo.com/quote/MSFT',
                'event_type': 'earnings',
                'ticker': 'MSFT',
                'expected_value': 2.75,
                'actual_value': None,
                'previous_value': 2.50,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.5,
                'likes': 0,
                'retweets': 0,
            },
        ]

    def _get_mock_indicators(self) -> List[Dict]:
        """Mock economic indicator data"""
        now = datetime.now()
        return [
            {
                'id': 3,
                'title': 'US Retail Sales Data Release',
                'summary': 'Monthly retail sales figures',
                'source': 'US Census Bureau',
                'category': 'economic_indicator',
                'timestamp': now,
                'event_date': now + timedelta(days=1, hours=8, minutes=30),
                'url': 'https://www.census.gov/retail/',
                'event_type': 'retail_sales',
                'ticker': None,
                'expected_value': 0.5,
                'actual_value': None,
                'previous_value': 0.3,
                'impact': 'medium',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.6,
                'likes': 0,
                'retweets': 0,
            },
        ]

    def _get_mock_events(self) -> List[Dict]:
        """Combined mock data: earnings, indicators and one central bank event."""
        released = datetime.now() - timedelta(hours=2)
        return self._get_mock_earnings() + self._get_mock_indicators() + [
            {
                'id': 4,
                'title': 'Federal Reserve: FOMC Meeting Minutes Released',
                'summary': 'Minutes from the latest Federal Open Market Committee meeting',
                'source': 'Federal Reserve',
                'category': 'central_bank',
                'timestamp': released,
                'event_date': released,
                'url': 'https://www.federalreserve.gov/',
                'event_type': 'central_bank_announcement',
                'ticker': None,
                'expected_value': None,
                'actual_value': None,
                'previous_value': None,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': True,
                'source_weight': 2.0,
                'likes': 0,
                'retweets': 0,
            },
        ]