"""
Market Events Scraper - Earnings, Economic Indicators & Central Bank Events.

Aggregates upcoming and recent market-moving events.
Uses web scraping and public RSS feeds - no API keys required.
"""

import calendar
import hashlib
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from typing import List, Dict, Optional

import requests
import feedparser
from bs4 import BeautifulSoup

# Configure the root logger at import time with level INFO. basicConfig() is a
# no-op when the root logger already has handlers, so an application that set
# up logging before importing this module keeps its own configuration.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by every method below.
logger = logging.getLogger(__name__)


class MarketEventsScraper:
    """
    Scrapes market events from multiple sources.

    Focus: earnings, economic indicators, central bank announcements.

    Every event is a plain dict with the same keys: id, title, summary,
    source, category, timestamp, event_date, url, event_type, ticker,
    expected_value, actual_value, previous_value, impact, sentiment,
    is_breaking, source_weight, likes, retweets. All datetimes are naive
    values in local time so that events from different sources can be
    compared and sorted together.
    """

    # RSS feeds polled by _fetch_central_bank_events; 'weight' is copied into
    # each event's 'source_weight'.
    CENTRAL_BANKS = {
        'fed': {
            'name': 'Federal Reserve',
            'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
            'weight': 2.0
        },
        'ecb': {
            'name': 'European Central Bank',
            'rss': 'https://www.ecb.europa.eu/rss/press.xml',
            'weight': 2.0
        }
    }

    EARNINGS_URL = 'https://finance.yahoo.com/calendar/earnings'
    REQUEST_TIMEOUT = 10      # seconds, per HTTP request
    FETCH_TIMEOUT = 35        # seconds to wait for each source's worker
    MAX_FEED_ENTRIES = 10     # newest entries inspected per RSS feed
    MAX_FEED_AGE_DAYS = 7     # older announcements are ignored
    SUMMARY_MAX_CHARS = 200   # longer summaries are truncated with '...'

    def __init__(self):
        """Initialize scraper with a shared HTTP session and browser-like headers."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })

    def scrape_market_events(self, max_items: int = 50, days_ahead: int = 14) -> List[Dict]:
        """
        Scrape market events from all sources.

        The three sources are fetched concurrently; a failing source is
        logged and skipped. Duplicate events (same URL) are dropped.

        Args:
            max_items: Maximum number of events to return.
            days_ahead: Look-ahead window; events dated later than this many
                days from now are excluded.

        Returns:
            Unified list sorted by event date, with high-impact events first
            among events sharing the same date. If no source produced any
            event, the combined mock data is returned as-is.
        """
        fetchers = (
            ('earnings', self._fetch_earnings),
            ('indicators', self._fetch_economic_indicators),
            ('central_banks', self._fetch_central_bank_events),
        )
        all_events = []
        seen_keys = set()

        with ThreadPoolExecutor(max_workers=len(fetchers)) as executor:
            futures = [
                (executor.submit(fetch), source_type)
                for source_type, fetch in fetchers
            ]

            for future, source_type in futures:
                try:
                    events = future.result(timeout=self.FETCH_TIMEOUT)

                    for event in events:
                        # Events without a URL are de-duplicated by id so
                        # they are not all collapsed into a single entry.
                        key = event.get('url') or event.get('id')
                        if key in seen_keys:
                            continue
                        seen_keys.add(key)
                        all_events.append(event)

                    logger.info(f"Fetched {len(events)} events from {source_type}")

                except Exception as e:
                    # Best effort: one broken source must not sink the rest.
                    logger.error(f"Error fetching {source_type}: {e}")

        if not all_events:
            logger.warning("No market events fetched - using mock data")
            return self._get_mock_events()

        return self._rank_events(all_events, max_items, days_ahead)

    @staticmethod
    def _rank_events(events: List[Dict], max_items: int, days_ahead: int,
                     now: Optional[datetime] = None) -> List[Dict]:
        """Drop events beyond the look-ahead horizon, sort, and truncate."""
        if now is None:
            now = datetime.now()
        horizon = now + timedelta(days=days_ahead)

        def event_time(event: Dict) -> datetime:
            return event.get('event_date') or event['timestamp']

        selected = [event for event in events if event_time(event) <= horizon]
        # False sorts before True, so high-impact events lead within a date.
        selected.sort(key=lambda event: (event_time(event), event.get('impact') != 'high'))
        return selected[:max(max_items, 0)]

    @staticmethod
    def _stable_id(key: str) -> int:
        """
        Return a non-negative 63-bit integer id derived from ``key``.

        The built-in hash() is salted per process for strings, so it cannot
        be used for ids that must stay the same across runs.
        """
        digest = hashlib.sha256(key.encode('utf-8')).digest()
        return int.from_bytes(digest[:8], 'big') >> 1

    def _fetch_earnings(self) -> List[Dict]:
        """
        Fetch earnings calendar from Yahoo Finance (web scraping approach).

        Returns:
            Parsed earnings events, or the mock earnings when the request
            fails, the table cannot be found, or no row could be parsed.
        """
        try:
            response = self.session.get(self.EARNINGS_URL, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'class': re.compile('earnings')})

            if not table:
                logger.warning("Could not find earnings table on Yahoo Finance")
                return self._get_mock_earnings()

            events = []
            # Skip the header row and cap the number of rows processed.
            for row in table.find_all('tr')[1:20]:
                try:
                    event = self._parse_earnings_row(row)
                except Exception as e:
                    logger.debug(f"Error parsing earnings row: {e}")
                    continue
                if event is not None:
                    events.append(event)

            return events if events else self._get_mock_earnings()

        except Exception as e:
            logger.error(f"Error fetching earnings: {e}")
            return self._get_mock_earnings()

    def _parse_earnings_row(self, row) -> Optional[Dict]:
        """Convert one earnings table row into an event, or None if unusable."""
        cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
        if len(cells) < 4:
            return None

        ticker, company, eps_estimate, reported_eps = cells[:4]
        if not ticker:
            # Without a ticker there is no meaningful title, URL or id.
            return None
        event_time = cells[4] if len(cells) > 4 else 'N/A'

        event_date = self._parse_earnings_date(event_time)

        summary = f"Expected EPS: {eps_estimate}"
        if reported_eps and reported_eps != 'N/A':
            summary += f", Reported: {reported_eps}"

        return {
            'id': self._stable_id(f"earnings_{ticker}_{event_date}"),
            'title': f"{company} ({ticker}) Earnings Report",
            'summary': summary,
            'source': 'Yahoo Finance',
            'category': 'earnings',
            'timestamp': datetime.now(),
            'event_date': event_date,
            'url': f"https://finance.yahoo.com/quote/{ticker}",
            'event_type': 'earnings',
            'ticker': ticker,
            'expected_value': self._parse_float(eps_estimate),
            'actual_value': self._parse_float(reported_eps),
            'previous_value': None,
            'impact': 'medium',
            'sentiment': self._determine_earnings_sentiment(eps_estimate, reported_eps),
            'is_breaking': False,
            'source_weight': 1.3,
            'likes': 0,
            'retweets': 0
        }

    def _fetch_economic_indicators(self) -> List[Dict]:
        """
        Return economic indicator events.

        No live source is wired up yet: this currently returns the
        placeholder data from _get_mock_indicators().
        """
        return self._get_mock_indicators()

    def _fetch_central_bank_events(self) -> List[Dict]:
        """
        Fetch central bank announcements from RSS feeds.

        Each feed in CENTRAL_BANKS is downloaded through the shared session
        (so the request has a timeout) and then parsed. A feed that cannot be
        fetched is logged and skipped; an entry that cannot be parsed is
        skipped.
        """
        events = []

        for bank_id, bank_info in self.CENTRAL_BANKS.items():
            try:
                response = self.session.get(bank_info['rss'], timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                feed = feedparser.parse(response.content)

                for entry in feed.entries[:self.MAX_FEED_ENTRIES]:
                    try:
                        event = self._build_central_bank_event(bank_id, bank_info, entry)
                    except Exception as e:
                        logger.debug(f"Error parsing {bank_id} entry: {e}")
                        continue
                    if event is not None:
                        events.append(event)

            except Exception as e:
                logger.error(f"Error fetching {bank_id} RSS: {e}")

        return events

    def _build_central_bank_event(self, bank_id: str, bank_info: Dict, entry) -> Optional[Dict]:
        """Convert one RSS entry into an event, or None if it is too old."""
        timestamp = self._entry_timestamp(entry)
        age = datetime.now() - timestamp

        if age.days > self.MAX_FEED_AGE_DAYS:
            return None

        title = entry.get('title', '')
        summary = entry.get('summary', '') or title
        url = entry.get('link', '')

        if summary:
            # Feed summaries may contain HTML; keep the text only.
            summary = BeautifulSoup(summary, 'html.parser').get_text()
            if len(summary) > self.SUMMARY_MAX_CHARS:
                summary = summary[:self.SUMMARY_MAX_CHARS] + '...'

        # Fall back to other entry fields so link-less entries get distinct ids.
        id_key = url or f"{bank_id}:{title}:{timestamp.isoformat()}"

        return {
            'id': self._stable_id(id_key),
            'title': f"{bank_info['name']}: {title}",
            'summary': summary,
            'source': bank_info['name'],
            'category': 'central_bank',
            'timestamp': timestamp,
            'event_date': timestamp,
            'url': url,
            'event_type': 'central_bank_announcement',
            'ticker': None,
            'expected_value': None,
            'actual_value': None,
            'previous_value': None,
            'impact': 'high',
            'sentiment': 'neutral',
            'is_breaking': age.days < 1,
            'source_weight': bank_info['weight'],
            'likes': 0,
            'retweets': 0
        }

    @staticmethod
    def _entry_timestamp(entry) -> datetime:
        """
        Return the entry's publication time as a naive local datetime.

        feedparser exposes ``published_parsed`` as a UTC time tuple, so it is
        converted through an epoch timestamp rather than used directly;
        otherwise comparisons with datetime.now() would be off by the local
        UTC offset. Entries without a publication time are treated as
        published now.
        """
        published = getattr(entry, 'published_parsed', None)
        if not published:
            return datetime.now()
        return datetime.fromtimestamp(calendar.timegm(published))

    def _parse_earnings_date(self, time_str: str) -> datetime:
        """
        Parse earnings report time.

        Only the session marker is interpreted: "Before Market"/"BMO" maps to
        today 07:00 and "After Market"/"AMC" to today 16:00. Anything else is
        assumed to be tomorrow at 07:00.
        """
        now = datetime.now()

        if 'Before Market' in time_str or 'BMO' in time_str:
            return now.replace(hour=7, minute=0, second=0, microsecond=0)
        if 'After Market' in time_str or 'AMC' in time_str:
            return now.replace(hour=16, minute=0, second=0, microsecond=0)

        tomorrow = now + timedelta(days=1)
        return tomorrow.replace(hour=7, minute=0, second=0, microsecond=0)

    def _parse_float(self, value_str: str) -> Optional[float]:
        """
        Parse float from string.

        Everything except digits, '.' and '-' is stripped first, so values
        such as "$2.10" or "1,234.50" parse. Returns None for empty input,
        the placeholders 'N/A' and '-', and anything that is not a number.
        """
        if not value_str or value_str == 'N/A' or value_str == '-':
            return None

        try:
            cleaned = re.sub(r'[^\d.-]', '', value_str)
            return float(cleaned)
        except (TypeError, ValueError):
            return None

    def _determine_earnings_sentiment(self, expected: str, actual: Optional[str]) -> str:
        """
        Determine sentiment based on earnings beat/miss.

        Returns 'positive' when the reported value beats the estimate,
        'negative' when it misses, and 'neutral' when they are equal or when
        either value is missing or unparseable.
        """
        if not actual or actual == 'N/A':
            return 'neutral'

        exp_val = self._parse_float(expected)
        act_val = self._parse_float(actual)

        if exp_val is None or act_val is None:
            return 'neutral'

        if act_val > exp_val:
            return 'positive'
        if act_val < exp_val:
            return 'negative'
        return 'neutral'

    def _get_mock_earnings(self) -> List[Dict]:
        """Mock earnings data (used when the earnings page cannot be scraped)."""
        now = datetime.now()

        return [
            {
                'id': 1,
                'title': 'Apple Inc. (AAPL) Earnings Report',
                'summary': 'Expected EPS: $2.10',
                'source': 'Yahoo Finance',
                'category': 'earnings',
                'timestamp': now,
                'event_date': now + timedelta(days=2, hours=16),
                'url': 'https://finance.yahoo.com/quote/AAPL',
                'event_type': 'earnings',
                'ticker': 'AAPL',
                'expected_value': 2.10,
                'actual_value': None,
                'previous_value': 1.95,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.5,
                'likes': 0,
                'retweets': 0
            },
            {
                'id': 2,
                'title': 'Microsoft Corporation (MSFT) Earnings Report',
                'summary': 'Expected EPS: $2.75',
                'source': 'Yahoo Finance',
                'category': 'earnings',
                'timestamp': now,
                'event_date': now + timedelta(days=3, hours=16),
                'url': 'https://finance.yahoo.com/quote/MSFT',
                'event_type': 'earnings',
                'ticker': 'MSFT',
                'expected_value': 2.75,
                'actual_value': None,
                'previous_value': 2.50,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.5,
                'likes': 0,
                'retweets': 0
            }
        ]

    def _get_mock_indicators(self) -> List[Dict]:
        """Mock economic indicator data."""
        now = datetime.now()

        return [
            {
                'id': 3,
                'title': 'US Retail Sales Data Release',
                'summary': 'Monthly retail sales figures',
                'source': 'US Census Bureau',
                'category': 'economic_indicator',
                'timestamp': now,
                'event_date': now + timedelta(days=1, hours=8, minutes=30),
                'url': 'https://www.census.gov/retail/',
                'event_type': 'retail_sales',
                'ticker': None,
                'expected_value': 0.5,
                'actual_value': None,
                'previous_value': 0.3,
                'impact': 'medium',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.6,
                'likes': 0,
                'retweets': 0
            }
        ]

    def _get_mock_events(self) -> List[Dict]:
        """Combined mock data: mock earnings, mock indicators and one Fed event."""
        published = datetime.now() - timedelta(hours=2)

        return self._get_mock_earnings() + self._get_mock_indicators() + [
            {
                'id': 4,
                'title': 'Federal Reserve: FOMC Meeting Minutes Released',
                'summary': 'Minutes from the latest Federal Open Market Committee meeting',
                'source': 'Federal Reserve',
                'category': 'central_bank',
                'timestamp': published,
                'event_date': published,
                'url': 'https://www.federalreserve.gov/',
                'event_type': 'central_bank_announcement',
                'ticker': None,
                'expected_value': None,
                'actual_value': None,
                'previous_value': None,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': True,
                'source_weight': 2.0,
                'likes': 0,
                'retweets': 0
            }
        ]

|