# UnifiedFinancialPlatform — app/services/market_events.py
# Author: Dmitry Beresnev — "init project" (commit e189a31)
"""
Market Events Scraper - Earnings, Economic Indicators & Central Bank Events
Aggregates upcoming and recent market-moving events
Web scraping approach - no API keys required
"""
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import logging
import re
from concurrent.futures import ThreadPoolExecutor
import requests
import feedparser
from bs4 import BeautifulSoup
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MarketEventsScraper:
    """
    Scrapes market-moving events from multiple public sources.

    Focus: earnings reports, economic indicators and central bank
    announcements.  Pure web scraping / RSS — no API keys required.

    Every event is a flat dict sharing a unified schema: id, title,
    summary, source, category, timestamp, event_date, url, event_type,
    ticker, expected_value, actual_value, previous_value,
    impact ('high'/'medium'), sentiment, is_breaking, source_weight,
    likes, retweets.
    """

    # Central bank RSS feeds (already in use for news elsewhere in the
    # platform).  'weight' is surfaced as 'source_weight' on each event.
    CENTRAL_BANKS = {
        'fed': {
            'name': 'Federal Reserve',
            'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
            'weight': 2.0
        },
        'ecb': {
            'name': 'European Central Bank',
            'rss': 'https://www.ecb.europa.eu/rss/press.xml',
            'weight': 2.0
        }
    }

    def __init__(self):
        """Initialize an HTTP session with browser-like headers."""
        self.session = requests.Session()
        # A desktop-browser User-Agent avoids trivial bot blocking on
        # Yahoo Finance and similar sites.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })

    def scrape_market_events(self, max_items: int = 50, days_ahead: int = 14) -> List[Dict]:
        """
        Scrape market events from all sources concurrently.

        Args:
            max_items: maximum number of events to return.
            days_ahead: events scheduled further than this many days in
                the future are dropped.  (FIX: the parameter previously
                existed but was never used; it is now honoured.)

        Returns:
            Event dicts sorted by event date, with high-impact events
            first among same-date events.  Falls back to mock data when
            every source fails or yields nothing.
        """
        all_events = []
        seen_urls = set()
        cutoff = datetime.now() + timedelta(days=days_ahead)
        # Fetch the three sources in parallel; a slow or failing source
        # must not block or sink the others.
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [
                (executor.submit(self._fetch_earnings), 'earnings'),
                (executor.submit(self._fetch_economic_indicators), 'indicators'),
                (executor.submit(self._fetch_central_bank_events), 'central_banks'),
            ]
            for future, source_type in futures:
                try:
                    # 35s cap per source > the 10s per-request timeout,
                    # so workers normally finish well before this fires.
                    events = future.result(timeout=35)
                    # Deduplicate across sources by URL.
                    for event in events:
                        if event['url'] not in seen_urls:
                            seen_urls.add(event['url'])
                            all_events.append(event)
                    logger.info("Fetched %d events from %s", len(events), source_type)
                except Exception as e:
                    logger.error("Error fetching %s: %s", source_type, e)
        # Enforce the look-ahead window.  Past events are kept — recent
        # announcements remain relevant market context.
        all_events = [
            e for e in all_events
            if e.get('event_date', e['timestamp']) <= cutoff
        ]
        # If no events fetched, use mock data
        if not all_events:
            logger.warning("No market events fetched - using mock data")
            return self._get_mock_events()
        # Primary key: event date.  Secondary: (impact != 'high') is
        # False (0) for high-impact events, so they sort first on ties.
        all_events.sort(
            key=lambda x: (x.get('event_date', x['timestamp']), x['impact'] != 'high'),
        )
        return all_events[:max_items]

    def _fetch_earnings(self) -> List[Dict]:
        """
        Scrape the Yahoo Finance earnings calendar.

        Returns mock earnings whenever the request fails or the page
        layout changes, so downstream consumers always receive data.
        """
        try:
            url = 'https://finance.yahoo.com/calendar/earnings'
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            events = []
            # The calendar is a table whose class contains "earnings";
            # the exact class name varies between Yahoo deploys.
            table = soup.find('table', {'class': re.compile('earnings')})
            if not table:
                logger.warning("Could not find earnings table on Yahoo Finance")
                return self._get_mock_earnings()
            rows = table.find_all('tr')[1:20]  # skip header row, cap at ~20
            for row in rows:
                try:
                    cells = row.find_all('td')
                    if len(cells) < 4:
                        continue
                    # Assumed column order: ticker, company, EPS estimate,
                    # reported EPS, report time — TODO confirm against the
                    # live page; Yahoo reorders columns occasionally.
                    # (The guard above guarantees 4 cells, so the old
                    # per-cell `len(cells) > k` fallbacks were dead code.)
                    ticker = cells[0].get_text(strip=True)
                    company = cells[1].get_text(strip=True)
                    eps_estimate = cells[2].get_text(strip=True)
                    reported_eps = cells[3].get_text(strip=True)
                    event_time = cells[4].get_text(strip=True) if len(cells) > 4 else 'N/A'
                    event_date = self._parse_earnings_date(event_time)
                    summary = f"Expected EPS: {eps_estimate}"
                    if reported_eps and reported_eps != 'N/A':
                        summary += f", Reported: {reported_eps}"
                    events.append({
                        # NOTE(review): hash() is salted per process
                        # (PYTHONHASHSEED), so ids are not stable across
                        # runs — fine only if ids are ephemeral; confirm.
                        'id': hash(f"earnings_{ticker}_{event_date}"),
                        'title': f"{company} ({ticker}) Earnings Report",
                        'summary': summary,
                        'source': 'Yahoo Finance',
                        'category': 'earnings',
                        'timestamp': datetime.now(),
                        'event_date': event_date,
                        'url': f"https://finance.yahoo.com/quote/{ticker}",
                        'event_type': 'earnings',
                        'ticker': ticker,
                        'expected_value': self._parse_float(eps_estimate),
                        'actual_value': self._parse_float(reported_eps) if reported_eps else None,
                        'previous_value': None,
                        'impact': 'medium',  # earnings are generally medium impact
                        'sentiment': self._determine_earnings_sentiment(eps_estimate, reported_eps),
                        'is_breaking': False,
                        'source_weight': 1.3,
                        'likes': 0,
                        'retweets': 0
                    })
                except Exception as e:
                    # A single malformed row must not abort the scrape.
                    logger.debug("Error parsing earnings row: %s", e)
                    continue
            return events if events else self._get_mock_earnings()
        except Exception as e:
            logger.error("Error fetching earnings: %s", e)
            return self._get_mock_earnings()

    def _fetch_economic_indicators(self) -> List[Dict]:
        """
        Fetch economic indicator releases.

        Currently returns mock data: FRED RSS feeds carry historical
        series rather than a release calendar.  A real implementation
        would scrape the FRED release calendar instead.
        """
        try:
            events = []
            events.extend(self._get_mock_indicators())
            return events
        except Exception as e:
            logger.error("Error fetching economic indicators: %s", e)
            return self._get_mock_indicators()

    def _fetch_central_bank_events(self) -> List[Dict]:
        """
        Fetch central bank announcements from the CENTRAL_BANKS RSS feeds.

        Entries older than 7 days are skipped; entries from the last
        24 hours are flagged as breaking.
        """
        events = []
        for bank_id, bank_info in self.CENTRAL_BANKS.items():
            try:
                feed = feedparser.parse(bank_info['rss'])
                for entry in feed.entries[:10]:
                    try:
                        # published_parsed is a time.struct_time; fall back
                        # to "now" when the feed omits it.
                        if hasattr(entry, 'published_parsed') and entry.published_parsed:
                            timestamp = datetime(*entry.published_parsed[:6])
                        else:
                            timestamp = datetime.now()
                        # Skip stale announcements (>7 days old).
                        if (datetime.now() - timestamp).days > 7:
                            continue
                        title = entry.get('title', '')
                        summary = entry.get('summary', '') or title
                        url = entry.get('link', '')
                        # Feeds often embed HTML in summaries; strip it and
                        # truncate for display.
                        if summary:
                            summary = BeautifulSoup(summary, 'html.parser').get_text()
                            summary = summary[:200] + '...' if len(summary) > 200 else summary
                        events.append({
                            'id': hash(url),
                            'title': f"{bank_info['name']}: {title}",
                            'summary': summary,
                            'source': bank_info['name'],
                            'category': 'central_bank',
                            'timestamp': timestamp,
                            'event_date': timestamp,
                            'url': url,
                            'event_type': 'central_bank_announcement',
                            'ticker': None,
                            'expected_value': None,
                            'actual_value': None,
                            'previous_value': None,
                            'impact': 'high',  # central bank events are high impact
                            'sentiment': 'neutral',
                            'is_breaking': (datetime.now() - timestamp).days < 1,
                            'source_weight': bank_info['weight'],
                            'likes': 0,
                            'retweets': 0
                        })
                    except Exception as e:
                        logger.debug("Error parsing %s entry: %s", bank_id, e)
                        continue
            except Exception as e:
                logger.error("Error fetching %s RSS: %s", bank_id, e)
        return events

    def _parse_earnings_date(self, time_str: str) -> datetime:
        """
        Map a Yahoo report-time label to a concrete datetime.

        Yahoo uses "Before Market Open" / "After Market Close" (or the
        BMO / AMC abbreviations); anything unrecognised defaults to
        tomorrow morning.
        """
        now = datetime.now()
        if 'Before Market' in time_str or 'BMO' in time_str:
            return now.replace(hour=7, minute=0, second=0, microsecond=0)
        elif 'After Market' in time_str or 'AMC' in time_str:
            return now.replace(hour=16, minute=0, second=0, microsecond=0)
        else:
            return (now + timedelta(days=1)).replace(hour=7, minute=0, second=0, microsecond=0)

    def _parse_float(self, value_str: str) -> Optional[float]:
        """Parse a float from a display string like '$2.10'; None when absent."""
        if not value_str or value_str == 'N/A' or value_str == '-':
            return None
        try:
            # Strip currency symbols / separators; keep digits, '.' and '-'.
            cleaned = re.sub(r'[^\d.-]', '', value_str)
            return float(cleaned)
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt
        # and SystemExit; float() only raises ValueError here.
        except ValueError:
            return None

    def _determine_earnings_sentiment(self, expected: str, actual: Optional[str]) -> str:
        """Classify an earnings result as beat ('positive'), miss
        ('negative') or in-line/unknown ('neutral')."""
        if not actual or actual == 'N/A':
            return 'neutral'
        exp_val = self._parse_float(expected)
        act_val = self._parse_float(actual)
        if exp_val is None or act_val is None:
            return 'neutral'
        if act_val > exp_val:
            return 'positive'  # beat
        elif act_val < exp_val:
            return 'negative'  # miss
        else:
            return 'neutral'  # in-line

    def _get_mock_earnings(self) -> List[Dict]:
        """Static fallback earnings events (two large-cap tickers)."""
        now = datetime.now()
        return [
            {
                'id': 1,
                'title': 'Apple Inc. (AAPL) Earnings Report',
                'summary': 'Expected EPS: $2.10',
                'source': 'Yahoo Finance',
                'category': 'earnings',
                'timestamp': now,
                'event_date': now + timedelta(days=2, hours=16),
                'url': 'https://finance.yahoo.com/quote/AAPL',
                'event_type': 'earnings',
                'ticker': 'AAPL',
                'expected_value': 2.10,
                'actual_value': None,
                'previous_value': 1.95,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.5,
                'likes': 0,
                'retweets': 0
            },
            {
                'id': 2,
                'title': 'Microsoft Corporation (MSFT) Earnings Report',
                'summary': 'Expected EPS: $2.75',
                'source': 'Yahoo Finance',
                'category': 'earnings',
                'timestamp': now,
                'event_date': now + timedelta(days=3, hours=16),
                'url': 'https://finance.yahoo.com/quote/MSFT',
                'event_type': 'earnings',
                'ticker': 'MSFT',
                'expected_value': 2.75,
                'actual_value': None,
                'previous_value': 2.50,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.5,
                'likes': 0,
                'retweets': 0
            }
        ]

    def _get_mock_indicators(self) -> List[Dict]:
        """Static fallback economic-indicator event."""
        now = datetime.now()
        return [
            {
                'id': 3,
                'title': 'US Retail Sales Data Release',
                'summary': 'Monthly retail sales figures',
                'source': 'US Census Bureau',
                'category': 'economic_indicator',
                'timestamp': now,
                'event_date': now + timedelta(days=1, hours=8, minutes=30),
                'url': 'https://www.census.gov/retail/',
                'event_type': 'retail_sales',
                'ticker': None,
                'expected_value': 0.5,
                'actual_value': None,
                'previous_value': 0.3,
                'impact': 'medium',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.6,
                'likes': 0,
                'retweets': 0
            }
        ]

    def _get_mock_events(self) -> List[Dict]:
        """Combined mock data: earnings + indicators + one Fed event."""
        return self._get_mock_earnings() + self._get_mock_indicators() + [
            {
                'id': 4,
                'title': 'Federal Reserve: FOMC Meeting Minutes Released',
                'summary': 'Minutes from the latest Federal Open Market Committee meeting',
                'source': 'Federal Reserve',
                'category': 'central_bank',
                'timestamp': datetime.now() - timedelta(hours=2),
                'event_date': datetime.now() - timedelta(hours=2),
                'url': 'https://www.federalreserve.gov/',
                'event_type': 'central_bank_announcement',
                'ticker': None,
                'expected_value': None,
                'actual_value': None,
                'previous_value': None,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': True,
                'source_weight': 2.0,
                'likes': 0,
                'retweets': 0
            }
        ]