| """ |
| Reddit Financial News Scraper |
| Scrapes financial, trading, quant, and geopolitical news from Reddit |
| No authentication required - uses public RSS feeds |
| """ |
|
|
import logging
import re
from datetime import datetime, timedelta, timezone
from typing import Dict, List

import feedparser
|
|
# Module-level logger; handlers/levels are configured by the hosting application.
logger = logging.getLogger(__name__)
|
|
|
|
class RedditFinanceMonitor:
    """
    Reddit financial news aggregator using RSS feeds
    No authentication required - public RSS feeds only

    Posts are fetched from a fixed set of finance-related subreddits,
    de-duplicated by title, categorized (macro / markets / geopolitical),
    scored for sentiment and impact, then returned sorted by weighted
    engagement, highest first.
    """

    # Subreddit registry: RSS URL, relative importance weight, and the
    # default category used when no keyword rule matches a post title.
    SUBREDDITS = {
        # --- markets / trading ---
        'wallstreetbets': {
            'url': 'https://www.reddit.com/r/wallstreetbets/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'stocks': {
            'url': 'https://www.reddit.com/r/stocks/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'investing': {
            'url': 'https://www.reddit.com/r/investing/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['markets', 'macro'],
            'category': 'markets'
        },
        'stockmarket': {
            'url': 'https://www.reddit.com/r/StockMarket/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'options': {
            'url': 'https://www.reddit.com/r/options/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'daytrading': {
            'url': 'https://www.reddit.com/r/Daytrading/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'securityanalysis': {
            'url': 'https://www.reddit.com/r/SecurityAnalysis/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },

        # --- macro / economics ---
        'economics': {
            'url': 'https://www.reddit.com/r/Economics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['macro'],
            'category': 'macro'
        },
        'economy': {
            'url': 'https://www.reddit.com/r/economy/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['macro'],
            'category': 'macro'
        },

        # --- quant / algorithmic ---
        'algotrading': {
            'url': 'https://www.reddit.com/r/algotrading/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'quantfinance': {
            'url': 'https://www.reddit.com/r/quant/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },

        # --- geopolitical ---
        'geopolitics': {
            'url': 'https://www.reddit.com/r/geopolitics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'worldnews': {
            'url': 'https://www.reddit.com/r/worldnews/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'neutralpolitics': {
            'url': 'https://www.reddit.com/r/NeutralPolitics/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
    }

    # Keyword lists used to (re)categorize posts by title; matching is
    # case-insensitive and word-bounded (see _compile_keyword_pattern).
    MACRO_KEYWORDS = [
        'Fed', 'ECB', 'BoE', 'BoJ', 'FOMC', 'Powell', 'Lagarde',
        'interest rate', 'inflation', 'CPI', 'PPI', 'GDP',
        'unemployment', 'jobs report', 'NFP', 'central bank',
        'recession', 'QE', 'quantitative easing', 'monetary policy'
    ]

    MARKETS_KEYWORDS = [
        'stock', 'equity', 'bond', 'commodity', 'oil', 'gold',
        'earnings', 'revenue', 'profit', 'IPO', 'merger',
        'acquisition', 'trading', 'options', 'futures', 'forex'
    ]

    GEOPOLITICAL_KEYWORDS = [
        'war', 'conflict', 'sanction', 'trade', 'tariff',
        'election', 'China', 'Russia', 'Ukraine', 'Taiwan',
        'Middle East', 'Iran', 'Israel', 'NATO', 'UN'
    ]

    # Sentiment vocabularies (same word-bounded matching rules).
    POSITIVE_WORDS = ['bullish', 'bull', 'surge', 'gain', 'up', 'rally',
                      'boom', 'profit', 'growth']
    NEGATIVE_WORDS = ['bearish', 'bear', 'crash', 'loss', 'down', 'fall',
                      'decline', 'recession', 'crisis']

    def __init__(self):
        """Initialize the monitor and pre-compile keyword matchers once."""
        # Word-boundary matching prevents false hits such as 'up' inside
        # 'update' or 'UN' inside 'under'; an optional trailing 's' keeps
        # plurals ('sanctions', 'stocks', 'gains') matching.
        self._macro_pattern = self._compile_keyword_pattern(self.MACRO_KEYWORDS)
        self._markets_pattern = self._compile_keyword_pattern(self.MARKETS_KEYWORDS)
        self._geo_pattern = self._compile_keyword_pattern(self.GEOPOLITICAL_KEYWORDS)
        self._positive_pattern = self._compile_keyword_pattern(self.POSITIVE_WORDS)
        self._negative_pattern = self._compile_keyword_pattern(self.NEGATIVE_WORDS)

    @staticmethod
    def _compile_keyword_pattern(keywords: List[str]) -> re.Pattern:
        """Compile a case-insensitive, word-bounded alternation of *keywords*."""
        joined = '|'.join(re.escape(keyword) for keyword in keywords)
        return re.compile(r'\b(?:' + joined + r')s?\b', re.IGNORECASE)

    @staticmethod
    def _utc_now() -> datetime:
        """Naive UTC 'now', comparable with feedparser's UTC-based timestamps."""
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def _categorize_post(self, title: str, subreddit_info: Dict) -> str:
        """
        Categorize a post by title keywords, falling back to the
        subreddit's default category.

        Precedence: macro > geopolitical > markets > subreddit default.
        """
        if self._macro_pattern.search(title):
            return 'macro'
        if self._geo_pattern.search(title):
            return 'geopolitical'
        if self._markets_pattern.search(title):
            return 'markets'
        return subreddit_info.get('category', 'markets')

    def _detect_sentiment(self, title: str) -> str:
        """
        Simple keyword-count sentiment: 'positive', 'negative', or
        'neutral' (ties and no-hit titles are neutral).
        """
        positive_count = len(self._positive_pattern.findall(title))
        negative_count = len(self._negative_pattern.findall(title))

        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def _calculate_impact(self, score: int, num_comments: int, subreddit_weight: float) -> str:
        """
        Bucket a post's impact ('high'/'medium'/'low') from upvotes,
        comment count, and the subreddit's importance weight.
        """
        # Upvotes dominate (70/30 split) before the subreddit weighting.
        engagement_score = (score * 0.7) + (num_comments * 0.3)
        weighted_score = engagement_score * subreddit_weight

        if weighted_score > 500:
            return 'high'
        if weighted_score > 100:
            return 'medium'
        return 'low'

    def scrape_reddit_news(self, max_posts: int = 100, hours: int = 12) -> List[Dict]:
        """
        Scrape Reddit posts from financial subreddits.

        Args:
            max_posts: Maximum number of posts to return
            hours: Only include posts from the last N hours (default: 12)

        Returns:
            List of post dicts (title/summary/url/source/timestamp/category/
            sentiment/impact/is_breaking/engagement/platform), sorted by
            weighted engagement score, highest first.
        """
        all_posts: List[Dict] = []
        seen_titles = set()
        # feedparser's published_parsed is a UTC struct_time, so the cutoff
        # must be computed in UTC as well (local now() would skew the window).
        cutoff_time = self._utc_now() - timedelta(hours=hours)

        logger.info("Scraping Reddit posts from last %d hours...", hours)

        for subreddit_name, subreddit_info in self.SUBREDDITS.items():
            try:
                logger.info("Fetching r/%s...", subreddit_name)
                feed = feedparser.parse(subreddit_info['url'])
                fetched_before = len(all_posts)

                for entry in feed.entries[:20]:
                    try:
                        # published_parsed can be absent OR None; fall back
                        # to "now" instead of crashing and dropping the entry.
                        published = getattr(entry, 'published_parsed', None)
                        if published:
                            pub_date = datetime(*published[:6])
                        else:
                            pub_date = self._utc_now()

                        if pub_date < cutoff_time:
                            continue

                        title = entry.title.strip()
                        link = entry.link

                        # De-duplicate on the title prefix itself: hash()
                        # is process-salted and can collide.
                        dedupe_key = title[:100]
                        if dedupe_key in seen_titles:
                            continue
                        seen_titles.add(dedupe_key)

                        # Reddit RSS embeds "<n> points" / "<n> comments"
                        # in the entry HTML; scrape them out when present.
                        score = 0
                        num_comments = 0
                        if hasattr(entry, 'content'):
                            content_text = entry.content[0].value if entry.content else ''
                            score_match = re.search(r'(\d+)\s+points?', content_text)
                            if score_match:
                                score = int(score_match.group(1))
                            comment_match = re.search(r'(\d+)\s+comments?', content_text)
                            if comment_match:
                                num_comments = int(comment_match.group(1))

                        category = self._categorize_post(title, subreddit_info)
                        sentiment = self._detect_sentiment(title)
                        impact = self._calculate_impact(score, num_comments, subreddit_info['weight'])

                        # "Breaking": highly upvoted and less than 3h old.
                        is_breaking = (
                            (self._utc_now() - pub_date).total_seconds() < 10800 and
                            score > 1000
                        )

                        all_posts.append({
                            'title': title,
                            'summary': title,
                            'url': link,
                            'source': f"r/{subreddit_name}",
                            'timestamp': pub_date,
                            'category': category,
                            'sentiment': sentiment,
                            'impact': impact,
                            'is_breaking': is_breaking,
                            'engagement': {
                                'score': score,
                                'comments': num_comments
                            },
                            'platform': 'reddit'
                        })

                    except Exception as e:
                        logger.error(f"Error processing entry from r/{subreddit_name}: {e}")
                        continue

                # Before/after counter avoids rescanning all_posts per subreddit.
                logger.info("Fetched %d posts from r/%s",
                            len(all_posts) - fetched_before, subreddit_name)

            except Exception as e:
                logger.error(f"Error fetching r/{subreddit_name}: {e}")
                continue

        def _weighted_score(post: Dict) -> float:
            # post['source'] is always "r/<name>"; strip the fixed prefix.
            info = self.SUBREDDITS.get(post['source'][2:], {})
            return post['engagement']['score'] * info.get('weight', 1.0)

        all_posts.sort(key=_weighted_score, reverse=True)

        logger.info("Total Reddit posts scraped: %d", len(all_posts))

        return all_posts[:max_posts]

    def get_statistics(self) -> Dict:
        """
        Get statistics about scraped Reddit posts
        Note: Statistics are now managed by NewsCacheManager
        This method returns empty stats for backward compatibility
        """
        return {
            'total': 0,
            'high_impact': 0,
            'breaking': 0,
            'by_category': {
                'macro': 0,
                'markets': 0,
                'geopolitical': 0
            }
        }
|
|