"""
Sectoral News Scraper - 7 Major Market Sectors

Filters and aggregates news by sector: Finance, Tech, Energy, Healthcare,
Consumer, Industrials, Real Estate.
Leverages existing RSS infrastructure with sector-specific classification.
"""

import calendar
import hashlib
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from typing import Dict, List, Optional

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class SectoralNewsScraper:
    """
    Aggregates news by market sector.

    Uses RSS feeds + keyword classification.
    """

    # 7 Sector configuration with keywords and RSS feeds
    SECTORS = {
        'finance': {
            'name': 'Finance',
            'keywords': [
                'bank', 'JPMorgan', 'Goldman Sachs', 'Morgan Stanley',
                'Wells Fargo', 'Citigroup', 'Bank of America', 'fintech',
                'lending', 'credit', 'financial sector', 'banking',
                'insurance', 'asset management'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/10000664/device/rss/rss.html',  # CNBC Banking
                'https://feeds.bloomberg.com/markets/news.rss'
            ],
            'weight': 1.5
        },
        'tech': {
            'name': 'Technology',
            'keywords': [
                'Apple', 'Microsoft', 'Google', 'Alphabet', 'Amazon', 'Meta',
                'Facebook', 'NVIDIA', 'AMD', 'Intel', 'semiconductor', 'chip',
                'software', 'cloud', 'AI', 'artificial intelligence',
                'tech sector', 'Silicon Valley', 'Tesla'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/19854910/device/rss/rss.html',  # CNBC Technology
                'https://techcrunch.com/feed/'
            ],
            'weight': 1.5
        },
        'energy': {
            'name': 'Energy',
            'keywords': [
                'oil', 'gas', 'crude', 'petroleum', 'OPEC', 'Exxon',
                'ExxonMobil', 'Chevron', 'ConocoPhillips', 'renewable',
                'solar', 'wind', 'energy sector', 'pipeline', 'natural gas',
                'LNG', 'fracking', 'drilling'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/19832390/device/rss/rss.html',  # CNBC Energy
            ],
            'weight': 1.6
        },
        'healthcare': {
            'name': 'Healthcare',
            'keywords': [
                'pharma', 'pharmaceutical', 'biotech', 'FDA', 'drug',
                'vaccine', 'clinical trial', 'Pfizer', 'Johnson & Johnson',
                'Merck', 'AbbVie', 'Bristol Myers', 'healthcare', 'hospital',
                'medical device', 'therapeutics'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/10000108/device/rss/rss.html',  # CNBC Health
            ],
            'weight': 1.5
        },
        'consumer': {
            'name': 'Consumer & Retail',
            'keywords': [
                'retail', 'Amazon', 'Walmart', 'Target', 'Costco',
                'Home Depot', 'e-commerce', 'consumer', 'shopping',
                'Black Friday', 'sales', 'Nike', 'Starbucks', 'McDonald\'s',
                'consumer goods', 'discretionary'
            ],
            'rss_sources': [
                'https://www.cnbc.com/id/10001009/device/rss/rss.html',  # CNBC Retail
            ],
            'weight': 1.3
        },
        'industrials': {
            'name': 'Industrials',
            'keywords': [
                'Boeing', 'Airbus', 'Caterpillar', 'Deere', '3M', 'GE',
                'General Electric', 'Honeywell', 'Lockheed Martin',
                'manufacturing', 'industrial', 'aerospace', 'defense',
                'machinery', 'equipment', 'logistics', 'freight'
            ],
            'rss_sources': [
                'https://www.reuters.com/rss/businessNews',  # Reuters Business
            ],
            'weight': 1.4
        },
        'real_estate': {
            'name': 'Real Estate',
            'keywords': [
                'housing', 'mortgage', 'REIT', 'real estate', 'property',
                'home sales', 'construction', 'residential',
                'commercial real estate', 'housing market', 'home prices',
                'rent', 'rental', 'builder', 'homebuilder'
            ],
            'rss_sources': [],  # Will rely on keyword filtering from general news
            'weight': 1.3
        }
    }

    def __init__(self):
        """Initialize scraper with a browser-like requests session."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
        })

    def scrape_sectoral_news(self, max_items: int = 50, hours: int = 24) -> List[Dict]:
        """
        Scrape and classify news by sector.

        Args:
            max_items: Maximum number of items to return.
            hours: Look-back window; entries older than this are dropped.

        Returns:
            Aggregated list of news dicts, de-duplicated by URL and sorted
            by sector priority (tech, then finance, then others) and
            recency. Falls back to mock data if nothing could be fetched.
        """
        all_news = []
        seen_urls = set()

        # Parallel fetch from all sector RSS feeds (one worker per sector)
        with ThreadPoolExecutor(max_workers=7) as executor:
            futures = []
            for sector_id, sector_info in self.SECTORS.items():
                # Submit RSS fetching task for each sector
                futures.append((
                    executor.submit(self._fetch_sector_news, sector_id, sector_info, hours),
                    sector_id
                ))

            for future, sector_id in futures:
                try:
                    sector_news = future.result(timeout=35)
                    # Deduplicate by URL across sectors
                    for item in sector_news:
                        if item['url'] not in seen_urls:
                            seen_urls.add(item['url'])
                            all_news.append(item)
                    logger.info(f"Fetched {len(sector_news)} items for {sector_id}")
                except Exception as e:
                    logger.error(f"Error fetching {sector_id} news: {e}")

        # If no news fetched, use mock data
        if not all_news:
            logger.warning("No sectoral news fetched - using mock data")
            return self._get_mock_sectoral_news()

        # Sort by sector priority and timestamp: tech first, then finance,
        # then everything else; newest first within each group
        # (False sorts before True).
        all_news.sort(
            key=lambda x: (x['sector'] != 'tech',
                           x['sector'] != 'finance',
                           -x['timestamp'].timestamp()),
        )

        return all_news[:max_items]

    def _fetch_sector_news(self, sector_id: str, sector_info: Dict, hours: int) -> List[Dict]:
        """Fetch news for a specific sector from its configured RSS feeds."""
        sector_news = []

        # Fetch from sector-specific RSS feeds; a failing feed is logged
        # and skipped so the other feeds still contribute.
        for rss_url in sector_info['rss_sources']:
            try:
                feed_news = self._fetch_rss_feed(rss_url, sector_id, sector_info, hours)
                sector_news.extend(feed_news)
            except Exception as e:
                logger.debug(f"Error fetching RSS {rss_url}: {e}")

        # If no RSS news, could also filter general news sources by keywords
        # (This would require access to FinanceNewsScraper - skipping for now)

        return sector_news

    @staticmethod
    def _count_keyword_hits(keywords: List[str], text: str) -> int:
        """
        Count how many keywords occur in *text* (case-insensitive).

        Matches only at word starts, so short keywords no longer hit
        unrelated substrings ('AI' in 'said', 'oil' in 'turmoil') while
        'rate' still matches 'rates'. *text* is expected to be lowercase.
        """
        return sum(
            1 for kw in keywords
            if re.search(r'\b' + re.escape(kw.lower()), text)
        )

    def _fetch_rss_feed(self, rss_url: str, sector_id: str,
                        sector_info: Dict, hours: int) -> List[Dict]:
        """Fetch and parse one RSS feed, returning classified news dicts."""
        try:
            feed = feedparser.parse(rss_url)

            if not feed.entries:
                return []

            news_items = []
            cutoff_time = datetime.now() - timedelta(hours=hours)

            for entry in feed.entries[:15]:  # Limit to 15 per feed
                try:
                    # feedparser normalizes feed dates to UTC struct_time;
                    # convert via timegm so the naive datetime is in local
                    # time and compares correctly with datetime.now().
                    if getattr(entry, 'published_parsed', None):
                        timestamp = datetime.fromtimestamp(
                            calendar.timegm(entry.published_parsed))
                    elif getattr(entry, 'updated_parsed', None):
                        timestamp = datetime.fromtimestamp(
                            calendar.timegm(entry.updated_parsed))
                    else:
                        timestamp = datetime.now()

                    # Skip old news
                    if timestamp < cutoff_time:
                        continue

                    # Extract title and summary
                    title = entry.get('title', '')
                    summary = entry.get('summary', '') or entry.get('description', '')

                    # Clean HTML from summary and truncate for display
                    if summary:
                        summary = BeautifulSoup(summary, 'html.parser').get_text()
                        summary = summary[:200] + '...' if len(summary) > 200 else summary

                    url = entry.get('link', '')

                    # Verify sector relevance by keywords
                    text = f"{title} {summary}".lower()
                    keyword_matches = self._count_keyword_hits(
                        sector_info['keywords'], text)

                    # Skip if not relevant enough (unless from sector-specific
                    # feed). NOTE(review): with the current config every sector
                    # has <= 2 feeds, so this filter never fires - confirm the
                    # intended threshold.
                    if keyword_matches == 0 and len(sector_info['rss_sources']) > 3:
                        continue

                    # Categorize and analyze
                    category = self._categorize_news(text)
                    sentiment = self._analyze_sentiment(text)
                    impact = self._assess_impact(sector_info['weight'], keyword_matches)

                    news_items.append({
                        # Stable content-derived id: builtin hash() is salted
                        # per process (PYTHONHASHSEED), so it would differ
                        # between runs for the same URL.
                        'id': int(hashlib.md5(url.encode('utf-8')).hexdigest()[:15], 16),
                        'title': title,
                        'summary': summary or title[:200],
                        'source': sector_info['name'],
                        'sector': sector_id,  # Add sector field
                        'category': category,
                        'timestamp': timestamp,
                        'sentiment': sentiment,
                        'impact': impact,
                        'url': url,
                        'likes': 0,
                        'retweets': 0,
                        'is_breaking': False,
                        'source_weight': sector_info['weight'],
                        'from_web': False
                    })

                except Exception as e:
                    logger.debug(f"Error parsing RSS entry: {e}")
                    continue

            return news_items

        except Exception as e:
            logger.error(f"Error fetching RSS feed {rss_url}: {e}")
            return []

    def _categorize_news(self, text: str) -> str:
        """Categorize news as 'macro', 'markets' or 'geopolitical'."""
        macro_keywords = ['Fed', 'ECB', 'inflation', 'rate', 'GDP', 'economy', 'recession']
        markets_keywords = ['stock', 'earnings', 'revenue', 'profit', 'IPO', 'merger', 'acquisition']
        geo_keywords = ['China', 'tariff', 'trade war', 'sanctions', 'regulation']

        scores = {
            'macro': self._count_keyword_hits(macro_keywords, text),
            'markets': self._count_keyword_hits(markets_keywords, text),
            'geopolitical': self._count_keyword_hits(geo_keywords, text),
        }
        # Default to 'markets' when nothing matched
        return max(scores, key=scores.get) if max(scores.values()) > 0 else 'markets'

    def _analyze_sentiment(self, text: str) -> str:
        """Classify sentiment by counting bullish vs bearish keywords."""
        positive = ['surge', 'soar', 'rally', 'beat', 'upgrade', 'gain', 'rise', 'bullish', 'positive']
        negative = ['plunge', 'crash', 'fall', 'miss', 'downgrade', 'loss', 'drop', 'bearish', 'negative']

        pos_count = self._count_keyword_hits(positive, text)
        neg_count = self._count_keyword_hits(negative, text)

        if pos_count > neg_count:
            return 'positive'
        elif neg_count > pos_count:
            return 'negative'
        return 'neutral'

    def _assess_impact(self, sector_weight: float, keyword_matches: int) -> str:
        """Assess impact based on sector weight and keyword relevance."""
        if sector_weight >= 1.5 and keyword_matches >= 3:
            return 'high'
        elif keyword_matches >= 2:
            return 'medium'
        else:
            return 'low'

    def _get_mock_sectoral_news(self) -> List[Dict]:
        """Mock sectoral news for development (one item per sector)."""
        now = datetime.now()

        return [
            {
                'id': 1,
                'title': 'Apple announces new iPhone with advanced AI capabilities',
                'summary': 'Apple unveils next-generation iPhone featuring on-device AI processing',
                'source': 'Technology',
                'sector': 'tech',
                'category': 'markets',
                'timestamp': now - timedelta(minutes=30),
                'sentiment': 'positive',
                'impact': 'high',
                'url': 'https://techcrunch.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.5,
                'from_web': False
            },
            {
                'id': 2,
                'title': 'JPMorgan reports strong Q4 earnings beat analyst expectations',
                'summary': 'Major investment bank posts record profits amid trading surge',
                'source': 'Finance',
                'sector': 'finance',
                'category': 'markets',
                'timestamp': now - timedelta(hours=1),
                'sentiment': 'positive',
                'impact': 'high',
                'url': 'https://cnbc.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.5,
                'from_web': False
            },
            {
                'id': 3,
                'title': 'OPEC+ extends oil production cuts through Q2',
                'summary': 'Major oil producers agree to maintain supply restrictions',
                'source': 'Energy',
                'sector': 'energy',
                'category': 'geopolitical',
                'timestamp': now - timedelta(hours=2),
                'sentiment': 'neutral',
                'impact': 'high',
                'url': 'https://reuters.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.6,
                'from_web': False
            },
            {
                'id': 4,
                'title': 'Pfizer receives FDA approval for new cancer treatment',
                'summary': 'Breakthrough therapy approved for late-stage lung cancer',
                'source': 'Healthcare',
                'sector': 'healthcare',
                'category': 'markets',
                'timestamp': now - timedelta(hours=3),
                'sentiment': 'positive',
                'impact': 'medium',
                'url': 'https://cnbc.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.5,
                'from_web': False
            },
            {
                'id': 5,
                'title': 'Amazon expands same-day delivery to 50 new cities',
                'summary': 'E-commerce giant accelerates logistics network expansion',
                'source': 'Consumer & Retail',
                'sector': 'consumer',
                'category': 'markets',
                'timestamp': now - timedelta(hours=4),
                'sentiment': 'positive',
                'impact': 'medium',
                'url': 'https://techcrunch.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.3,
                'from_web': False
            },
            {
                'id': 6,
                'title': 'Boeing wins $10B contract for new military aircraft',
                'summary': 'Defense contractor secures major government order',
                'source': 'Industrials',
                'sector': 'industrials',
                'category': 'markets',
                'timestamp': now - timedelta(hours=5),
                'sentiment': 'positive',
                'impact': 'medium',
                'url': 'https://reuters.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.4,
                'from_web': False
            },
            {
                'id': 7,
                'title': 'US housing starts surge 15% in December',
                'summary': 'Construction activity rebounds amid lower mortgage rates',
                'source': 'Real Estate',
                'sector': 'real_estate',
                'category': 'macro',
                'timestamp': now - timedelta(hours=6),
                'sentiment': 'positive',
                'impact': 'medium',
                'url': 'https://cnbc.com',
                'likes': 0,
                'retweets': 0,
                'is_breaking': False,
                'source_weight': 1.3,
                'from_web': False
            }
        ]