| """ |
| Reddit Financial News Scraper |
| Scrapes financial, trading, quant, and geopolitical news from Reddit |
| No authentication required - uses public RSS feeds |
| """ |
|
|
import logging
import re
from datetime import datetime, timedelta, timezone
from typing import Dict, List

import feedparser
|
|
# Module-level logger; handlers/levels are configured by the hosting application.
logger = logging.getLogger(__name__)
|
|
|
|
class RedditFinanceMonitor:
    """
    Reddit financial news aggregator using RSS feeds
    No authentication required - public RSS feeds only

    Posts are fetched from a fixed set of finance-related subreddits,
    de-duplicated by title, categorized (macro / markets / geopolitical),
    scored for sentiment and impact, then returned sorted by weighted
    engagement, highest first.
    """

    # Subreddit registry: RSS URL, relative importance weight, and the
    # default category used when no keyword rule matches a post title.
    SUBREDDITS = {
        # --- markets / trading ---
        'wallstreetbets': {
            'url': 'https://www.reddit.com/r/wallstreetbets/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'stocks': {
            'url': 'https://www.reddit.com/r/stocks/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'investing': {
            'url': 'https://www.reddit.com/r/investing/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['markets', 'macro'],
            'category': 'markets'
        },
        'stockmarket': {
            'url': 'https://www.reddit.com/r/StockMarket/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'options': {
            'url': 'https://www.reddit.com/r/options/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'daytrading': {
            'url': 'https://www.reddit.com/r/Daytrading/top/.rss?t=day',
            'weight': 1.5,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'securityanalysis': {
            'url': 'https://www.reddit.com/r/SecurityAnalysis/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },

        # --- macro / economics ---
        'economics': {
            'url': 'https://www.reddit.com/r/Economics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['macro'],
            'category': 'macro'
        },
        'economy': {
            'url': 'https://www.reddit.com/r/economy/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['macro'],
            'category': 'macro'
        },

        # --- quant / algorithmic ---
        'algotrading': {
            'url': 'https://www.reddit.com/r/algotrading/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },
        'quantfinance': {
            'url': 'https://www.reddit.com/r/quant/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['markets'],
            'category': 'markets'
        },

        # --- geopolitical ---
        'geopolitics': {
            'url': 'https://www.reddit.com/r/geopolitics/top/.rss?t=day',
            'weight': 1.8,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'worldnews': {
            'url': 'https://www.reddit.com/r/worldnews/top/.rss?t=day',
            'weight': 1.7,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
        'neutralpolitics': {
            'url': 'https://www.reddit.com/r/NeutralPolitics/top/.rss?t=day',
            'weight': 1.6,
            'specialization': ['geopolitical'],
            'category': 'geopolitical'
        },
    }

    # Keyword lists used to (re)categorize posts by title; matching is
    # case-insensitive and word-bounded (see _compile_keyword_pattern).
    MACRO_KEYWORDS = [
        'Fed', 'ECB', 'BoE', 'BoJ', 'FOMC', 'Powell', 'Lagarde',
        'interest rate', 'inflation', 'CPI', 'PPI', 'GDP',
        'unemployment', 'jobs report', 'NFP', 'central bank',
        'recession', 'QE', 'quantitative easing', 'monetary policy'
    ]

    MARKETS_KEYWORDS = [
        'stock', 'equity', 'bond', 'commodity', 'oil', 'gold',
        'earnings', 'revenue', 'profit', 'IPO', 'merger',
        'acquisition', 'trading', 'options', 'futures', 'forex'
    ]

    GEOPOLITICAL_KEYWORDS = [
        'war', 'conflict', 'sanction', 'trade', 'tariff',
        'election', 'China', 'Russia', 'Ukraine', 'Taiwan',
        'Middle East', 'Iran', 'Israel', 'NATO', 'UN'
    ]

    # Sentiment vocabularies (same word-bounded matching rules).
    POSITIVE_WORDS = ['bullish', 'bull', 'surge', 'gain', 'up', 'rally',
                      'boom', 'profit', 'growth']
    NEGATIVE_WORDS = ['bearish', 'bear', 'crash', 'loss', 'down', 'fall',
                      'decline', 'recession', 'crisis']

    def __init__(self):
        """Initialize the monitor and pre-compile keyword matchers once."""
        # Word-boundary matching prevents false hits such as 'up' inside
        # 'update' or 'UN' inside 'under'; an optional trailing 's' keeps
        # plurals ('sanctions', 'stocks', 'gains') matching.
        self._macro_pattern = self._compile_keyword_pattern(self.MACRO_KEYWORDS)
        self._markets_pattern = self._compile_keyword_pattern(self.MARKETS_KEYWORDS)
        self._geo_pattern = self._compile_keyword_pattern(self.GEOPOLITICAL_KEYWORDS)
        self._positive_pattern = self._compile_keyword_pattern(self.POSITIVE_WORDS)
        self._negative_pattern = self._compile_keyword_pattern(self.NEGATIVE_WORDS)

    @staticmethod
    def _compile_keyword_pattern(keywords: List[str]) -> re.Pattern:
        """Compile a case-insensitive, word-bounded alternation of *keywords*."""
        joined = '|'.join(re.escape(keyword) for keyword in keywords)
        return re.compile(r'\b(?:' + joined + r')s?\b', re.IGNORECASE)

    @staticmethod
    def _utc_now() -> datetime:
        """Naive UTC 'now', comparable with feedparser's UTC-based timestamps."""
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def _categorize_post(self, title: str, subreddit_info: Dict) -> str:
        """
        Categorize a post by title keywords, falling back to the
        subreddit's default category.

        Precedence: macro > geopolitical > markets > subreddit default.
        """
        if self._macro_pattern.search(title):
            return 'macro'
        if self._geo_pattern.search(title):
            return 'geopolitical'
        if self._markets_pattern.search(title):
            return 'markets'
        return subreddit_info.get('category', 'markets')

    def _detect_sentiment(self, title: str) -> str:
        """
        Simple keyword-count sentiment: 'positive', 'negative', or
        'neutral' (ties and no-hit titles are neutral).
        """
        positive_count = len(self._positive_pattern.findall(title))
        negative_count = len(self._negative_pattern.findall(title))

        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def _calculate_impact(self, score: int, num_comments: int, subreddit_weight: float) -> str:
        """
        Bucket a post's impact ('high'/'medium'/'low') from upvotes,
        comment count, and the subreddit's importance weight.
        """
        # Upvotes dominate (70/30 split) before the subreddit weighting.
        engagement_score = (score * 0.7) + (num_comments * 0.3)
        weighted_score = engagement_score * subreddit_weight

        if weighted_score > 500:
            return 'high'
        if weighted_score > 100:
            return 'medium'
        return 'low'

    def scrape_reddit_news(self, max_posts: int = 100, hours: int = 12) -> List[Dict]:
        """
        Scrape Reddit posts from financial subreddits.

        Args:
            max_posts: Maximum number of posts to return
            hours: Only include posts from the last N hours (default: 12)

        Returns:
            List of post dicts (title/summary/url/source/timestamp/category/
            sentiment/impact/is_breaking/engagement/platform), sorted by
            weighted engagement score, highest first.
        """
        all_posts: List[Dict] = []
        seen_titles = set()
        # feedparser's published_parsed is a UTC struct_time, so the cutoff
        # must be computed in UTC as well (local now() would skew the window).
        cutoff_time = self._utc_now() - timedelta(hours=hours)

        logger.info("Scraping Reddit posts from last %d hours...", hours)

        for subreddit_name, subreddit_info in self.SUBREDDITS.items():
            try:
                logger.info("Fetching r/%s...", subreddit_name)
                feed = feedparser.parse(subreddit_info['url'])
                fetched_before = len(all_posts)

                for entry in feed.entries[:20]:
                    try:
                        # published_parsed can be absent OR None; fall back
                        # to "now" instead of crashing and dropping the entry.
                        published = getattr(entry, 'published_parsed', None)
                        if published:
                            pub_date = datetime(*published[:6])
                        else:
                            pub_date = self._utc_now()

                        if pub_date < cutoff_time:
                            continue

                        title = entry.title.strip()
                        link = entry.link

                        # De-duplicate on the title prefix itself: hash()
                        # is process-salted and can collide.
                        dedupe_key = title[:100]
                        if dedupe_key in seen_titles:
                            continue
                        seen_titles.add(dedupe_key)

                        # Reddit RSS embeds "<n> points" / "<n> comments"
                        # in the entry HTML; scrape them out when present.
                        score = 0
                        num_comments = 0
                        if hasattr(entry, 'content'):
                            content_text = entry.content[0].value if entry.content else ''
                            score_match = re.search(r'(\d+)\s+points?', content_text)
                            if score_match:
                                score = int(score_match.group(1))
                            comment_match = re.search(r'(\d+)\s+comments?', content_text)
                            if comment_match:
                                num_comments = int(comment_match.group(1))

                        category = self._categorize_post(title, subreddit_info)
                        sentiment = self._detect_sentiment(title)
                        impact = self._calculate_impact(score, num_comments, subreddit_info['weight'])

                        # "Breaking": highly upvoted and less than 3h old.
                        is_breaking = (
                            (self._utc_now() - pub_date).total_seconds() < 10800 and
                            score > 1000
                        )

                        all_posts.append({
                            'title': title,
                            'summary': title,
                            'url': link,
                            'source': f"r/{subreddit_name}",
                            'timestamp': pub_date,
                            'category': category,
                            'sentiment': sentiment,
                            'impact': impact,
                            'is_breaking': is_breaking,
                            'engagement': {
                                'score': score,
                                'comments': num_comments
                            },
                            'platform': 'reddit'
                        })

                    except Exception as e:
                        logger.error(f"Error processing entry from r/{subreddit_name}: {e}")
                        continue

                # Before/after counter avoids rescanning all_posts per subreddit.
                logger.info("Fetched %d posts from r/%s",
                            len(all_posts) - fetched_before, subreddit_name)

            except Exception as e:
                logger.error(f"Error fetching r/{subreddit_name}: {e}")
                continue

        def _weighted_score(post: Dict) -> float:
            # post['source'] is always "r/<name>"; strip the fixed prefix.
            info = self.SUBREDDITS.get(post['source'][2:], {})
            return post['engagement']['score'] * info.get('weight', 1.0)

        all_posts.sort(key=_weighted_score, reverse=True)

        logger.info("Total Reddit posts scraped: %d", len(all_posts))

        return all_posts[:max_posts]

    def get_statistics(self) -> Dict:
        """
        Get statistics about scraped Reddit posts
        Note: Statistics are now managed by NewsCacheManager
        This method returns empty stats for backward compatibility
        """
        return {
            'total': 0,
            'high_impact': 0,
            'breaking': 0,
            'by_category': {
                'macro': 0,
                'markets': 0,
                'geopolitical': 0
            }
        }
|
|