# UnifiedFinancialPlatform — app/services/market_events.py
# Author: Dmitry Beresnev — "init project" (commit e189a31)
"""
Market Events Scraper - Earnings, Economic Indicators & Central Bank Events
Aggregates upcoming and recent market-moving events
Web scraping approach - no API keys required
"""
from datetime import datetime, timedelta
from typing import List, Dict, Optional
import logging
import re
from concurrent.futures import ThreadPoolExecutor
import requests
import feedparser
from bs4 import BeautifulSoup
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MarketEventsScraper:
    """
    Scrapes market-moving events from multiple public sources.

    Focus: earnings reports, economic indicators and central bank
    announcements.  Pure web scraping / RSS — no API keys required.

    Every event is a flat dict sharing a unified schema: id, title,
    summary, source, category, timestamp, event_date, url, event_type,
    ticker, expected_value, actual_value, previous_value,
    impact ('high'/'medium'), sentiment, is_breaking, source_weight,
    likes, retweets.
    """

    # Central bank RSS feeds (already in use for news elsewhere in the
    # platform).  'weight' is surfaced as 'source_weight' on each event.
    CENTRAL_BANKS = {
        'fed': {
            'name': 'Federal Reserve',
            'rss': 'https://www.federalreserve.gov/feeds/press_all.xml',
            'weight': 2.0
        },
        'ecb': {
            'name': 'European Central Bank',
            'rss': 'https://www.ecb.europa.eu/rss/press.xml',
            'weight': 2.0
        }
    }

    def __init__(self):
        """Initialize an HTTP session with browser-like headers."""
        self.session = requests.Session()
        # A desktop-browser User-Agent avoids trivial bot blocking on
        # Yahoo Finance and similar sites.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })

    def scrape_market_events(self, max_items: int = 50, days_ahead: int = 14) -> List[Dict]:
        """
        Scrape market events from all sources concurrently.

        Args:
            max_items: maximum number of events to return.
            days_ahead: events scheduled further than this many days in
                the future are dropped.  (FIX: the parameter previously
                existed but was never used; it is now honoured.)

        Returns:
            Event dicts sorted by event date, with high-impact events
            first among same-date events.  Falls back to mock data when
            every source fails or yields nothing.
        """
        all_events = []
        seen_urls = set()
        cutoff = datetime.now() + timedelta(days=days_ahead)
        # Fetch the three sources in parallel; a slow or failing source
        # must not block or sink the others.
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = [
                (executor.submit(self._fetch_earnings), 'earnings'),
                (executor.submit(self._fetch_economic_indicators), 'indicators'),
                (executor.submit(self._fetch_central_bank_events), 'central_banks'),
            ]
            for future, source_type in futures:
                try:
                    # 35s cap per source > the 10s per-request timeout,
                    # so workers normally finish well before this fires.
                    events = future.result(timeout=35)
                    # Deduplicate across sources by URL.
                    for event in events:
                        if event['url'] not in seen_urls:
                            seen_urls.add(event['url'])
                            all_events.append(event)
                    logger.info("Fetched %d events from %s", len(events), source_type)
                except Exception as e:
                    logger.error("Error fetching %s: %s", source_type, e)
        # Enforce the look-ahead window.  Past events are kept — recent
        # announcements remain relevant market context.
        all_events = [
            e for e in all_events
            if e.get('event_date', e['timestamp']) <= cutoff
        ]
        # If no events fetched, use mock data
        if not all_events:
            logger.warning("No market events fetched - using mock data")
            return self._get_mock_events()
        # Primary key: event date.  Secondary: (impact != 'high') is
        # False (0) for high-impact events, so they sort first on ties.
        all_events.sort(
            key=lambda x: (x.get('event_date', x['timestamp']), x['impact'] != 'high'),
        )
        return all_events[:max_items]

    def _fetch_earnings(self) -> List[Dict]:
        """
        Scrape the Yahoo Finance earnings calendar.

        Returns mock earnings whenever the request fails or the page
        layout changes, so downstream consumers always receive data.
        """
        try:
            url = 'https://finance.yahoo.com/calendar/earnings'
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            events = []
            # The calendar is a table whose class contains "earnings";
            # the exact class name varies between Yahoo deploys.
            table = soup.find('table', {'class': re.compile('earnings')})
            if not table:
                logger.warning("Could not find earnings table on Yahoo Finance")
                return self._get_mock_earnings()
            rows = table.find_all('tr')[1:20]  # skip header row, cap at ~20
            for row in rows:
                try:
                    cells = row.find_all('td')
                    if len(cells) < 4:
                        continue
                    # Assumed column order: ticker, company, EPS estimate,
                    # reported EPS, report time — TODO confirm against the
                    # live page; Yahoo reorders columns occasionally.
                    # (The guard above guarantees 4 cells, so the old
                    # per-cell `len(cells) > k` fallbacks were dead code.)
                    ticker = cells[0].get_text(strip=True)
                    company = cells[1].get_text(strip=True)
                    eps_estimate = cells[2].get_text(strip=True)
                    reported_eps = cells[3].get_text(strip=True)
                    event_time = cells[4].get_text(strip=True) if len(cells) > 4 else 'N/A'
                    event_date = self._parse_earnings_date(event_time)
                    summary = f"Expected EPS: {eps_estimate}"
                    if reported_eps and reported_eps != 'N/A':
                        summary += f", Reported: {reported_eps}"
                    events.append({
                        # NOTE(review): hash() is salted per process
                        # (PYTHONHASHSEED), so ids are not stable across
                        # runs — fine only if ids are ephemeral; confirm.
                        'id': hash(f"earnings_{ticker}_{event_date}"),
                        'title': f"{company} ({ticker}) Earnings Report",
                        'summary': summary,
                        'source': 'Yahoo Finance',
                        'category': 'earnings',
                        'timestamp': datetime.now(),
                        'event_date': event_date,
                        'url': f"https://finance.yahoo.com/quote/{ticker}",
                        'event_type': 'earnings',
                        'ticker': ticker,
                        'expected_value': self._parse_float(eps_estimate),
                        'actual_value': self._parse_float(reported_eps) if reported_eps else None,
                        'previous_value': None,
                        'impact': 'medium',  # earnings are generally medium impact
                        'sentiment': self._determine_earnings_sentiment(eps_estimate, reported_eps),
                        'is_breaking': False,
                        'source_weight': 1.3,
                        'likes': 0,
                        'retweets': 0
                    })
                except Exception as e:
                    # A single malformed row must not abort the scrape.
                    logger.debug("Error parsing earnings row: %s", e)
                    continue
            return events if events else self._get_mock_earnings()
        except Exception as e:
            logger.error("Error fetching earnings: %s", e)
            return self._get_mock_earnings()

    def _fetch_economic_indicators(self) -> List[Dict]:
        """
        Fetch economic indicator releases.

        Currently returns mock data: FRED RSS feeds carry historical
        series rather than a release calendar.  A real implementation
        would scrape the FRED release calendar instead.
        """
        try:
            events = []
            events.extend(self._get_mock_indicators())
            return events
        except Exception as e:
            logger.error("Error fetching economic indicators: %s", e)
            return self._get_mock_indicators()

    def _fetch_central_bank_events(self) -> List[Dict]:
        """
        Fetch central bank announcements from the CENTRAL_BANKS RSS feeds.

        Entries older than 7 days are skipped; entries from the last
        24 hours are flagged as breaking.
        """
        events = []
        for bank_id, bank_info in self.CENTRAL_BANKS.items():
            try:
                feed = feedparser.parse(bank_info['rss'])
                for entry in feed.entries[:10]:
                    try:
                        # published_parsed is a time.struct_time; fall back
                        # to "now" when the feed omits it.
                        if hasattr(entry, 'published_parsed') and entry.published_parsed:
                            timestamp = datetime(*entry.published_parsed[:6])
                        else:
                            timestamp = datetime.now()
                        # Skip stale announcements (>7 days old).
                        if (datetime.now() - timestamp).days > 7:
                            continue
                        title = entry.get('title', '')
                        summary = entry.get('summary', '') or title
                        url = entry.get('link', '')
                        # Feeds often embed HTML in summaries; strip it and
                        # truncate for display.
                        if summary:
                            summary = BeautifulSoup(summary, 'html.parser').get_text()
                            summary = summary[:200] + '...' if len(summary) > 200 else summary
                        events.append({
                            'id': hash(url),
                            'title': f"{bank_info['name']}: {title}",
                            'summary': summary,
                            'source': bank_info['name'],
                            'category': 'central_bank',
                            'timestamp': timestamp,
                            'event_date': timestamp,
                            'url': url,
                            'event_type': 'central_bank_announcement',
                            'ticker': None,
                            'expected_value': None,
                            'actual_value': None,
                            'previous_value': None,
                            'impact': 'high',  # central bank events are high impact
                            'sentiment': 'neutral',
                            'is_breaking': (datetime.now() - timestamp).days < 1,
                            'source_weight': bank_info['weight'],
                            'likes': 0,
                            'retweets': 0
                        })
                    except Exception as e:
                        logger.debug("Error parsing %s entry: %s", bank_id, e)
                        continue
            except Exception as e:
                logger.error("Error fetching %s RSS: %s", bank_id, e)
        return events

    def _parse_earnings_date(self, time_str: str) -> datetime:
        """
        Map a Yahoo report-time label to a concrete datetime.

        Yahoo uses "Before Market Open" / "After Market Close" (or the
        BMO / AMC abbreviations); anything unrecognised defaults to
        tomorrow morning.
        """
        now = datetime.now()
        if 'Before Market' in time_str or 'BMO' in time_str:
            return now.replace(hour=7, minute=0, second=0, microsecond=0)
        elif 'After Market' in time_str or 'AMC' in time_str:
            return now.replace(hour=16, minute=0, second=0, microsecond=0)
        else:
            return (now + timedelta(days=1)).replace(hour=7, minute=0, second=0, microsecond=0)

    def _parse_float(self, value_str: str) -> Optional[float]:
        """Parse a float from a display string like '$2.10'; None when absent."""
        if not value_str or value_str == 'N/A' or value_str == '-':
            return None
        try:
            # Strip currency symbols / separators; keep digits, '.' and '-'.
            cleaned = re.sub(r'[^\d.-]', '', value_str)
            return float(cleaned)
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt
        # and SystemExit; float() only raises ValueError here.
        except ValueError:
            return None

    def _determine_earnings_sentiment(self, expected: str, actual: Optional[str]) -> str:
        """Classify an earnings result as beat ('positive'), miss
        ('negative') or in-line/unknown ('neutral')."""
        if not actual or actual == 'N/A':
            return 'neutral'
        exp_val = self._parse_float(expected)
        act_val = self._parse_float(actual)
        if exp_val is None or act_val is None:
            return 'neutral'
        if act_val > exp_val:
            return 'positive'  # beat
        elif act_val < exp_val:
            return 'negative'  # miss
        else:
            return 'neutral'  # in-line

    def _get_mock_earnings(self) -> List[Dict]:
        """Static fallback earnings events (two large-cap tickers)."""
        now = datetime.now()
        return [
            {
                'id': 1,
                'title': 'Apple Inc. (AAPL) Earnings Report',
                'summary': 'Expected EPS: $2.10',
                'source': 'Yahoo Finance',
                'category': 'earnings',
                'timestamp': now,
                'event_date': now + timedelta(days=2, hours=16),
                'url': 'https://finance.yahoo.com/quote/AAPL',
                'event_type': 'earnings',
                'ticker': 'AAPL',
                'expected_value': 2.10,
                'actual_value': None,
                'previous_value': 1.95,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.5,
                'likes': 0,
                'retweets': 0
            },
            {
                'id': 2,
                'title': 'Microsoft Corporation (MSFT) Earnings Report',
                'summary': 'Expected EPS: $2.75',
                'source': 'Yahoo Finance',
                'category': 'earnings',
                'timestamp': now,
                'event_date': now + timedelta(days=3, hours=16),
                'url': 'https://finance.yahoo.com/quote/MSFT',
                'event_type': 'earnings',
                'ticker': 'MSFT',
                'expected_value': 2.75,
                'actual_value': None,
                'previous_value': 2.50,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.5,
                'likes': 0,
                'retweets': 0
            }
        ]

    def _get_mock_indicators(self) -> List[Dict]:
        """Static fallback economic-indicator event."""
        now = datetime.now()
        return [
            {
                'id': 3,
                'title': 'US Retail Sales Data Release',
                'summary': 'Monthly retail sales figures',
                'source': 'US Census Bureau',
                'category': 'economic_indicator',
                'timestamp': now,
                'event_date': now + timedelta(days=1, hours=8, minutes=30),
                'url': 'https://www.census.gov/retail/',
                'event_type': 'retail_sales',
                'ticker': None,
                'expected_value': 0.5,
                'actual_value': None,
                'previous_value': 0.3,
                'impact': 'medium',
                'sentiment': 'neutral',
                'is_breaking': False,
                'source_weight': 1.6,
                'likes': 0,
                'retweets': 0
            }
        ]

    def _get_mock_events(self) -> List[Dict]:
        """Combined mock data: earnings + indicators + one Fed event."""
        return self._get_mock_earnings() + self._get_mock_indicators() + [
            {
                'id': 4,
                'title': 'Federal Reserve: FOMC Meeting Minutes Released',
                'summary': 'Minutes from the latest Federal Open Market Committee meeting',
                'source': 'Federal Reserve',
                'category': 'central_bank',
                'timestamp': datetime.now() - timedelta(hours=2),
                'event_date': datetime.now() - timedelta(hours=2),
                'url': 'https://www.federalreserve.gov/',
                'event_type': 'central_bank_announcement',
                'ticker': None,
                'expected_value': None,
                'actual_value': None,
                'previous_value': None,
                'impact': 'high',
                'sentiment': 'neutral',
                'is_breaking': True,
                'source_weight': 2.0,
                'likes': 0,
                'retweets': 0
            }
        ]