| """ |
| AI & Tech News Scraper |
| Fetches news from popular tech resources and big tech company blogs |
| """ |
|
|
import logging
from datetime import datetime, timedelta, timezone
from typing import Dict, List

import feedparser
import requests
from bs4 import BeautifulSoup
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class AITechNewsScraper:
    """Scraper for AI and tech news from major sources and company blogs.

    Feeds listed in ``SOURCES`` are fetched over HTTP through a shared
    ``requests.Session`` and parsed with ``feedparser``; each entry is
    normalized into a plain dict (see ``_scrape_rss_feed``).
    """

    # Seconds to wait on any single HTTP request — prevents one slow feed
    # from hanging the entire scrape.
    REQUEST_TIMEOUT = 15

    # Feed registry: display name -> {url, transport type, category tag}.
    SOURCES = {
        # Major tech media
        'TechCrunch AI': {
            'url': 'https://techcrunch.com/category/artificial-intelligence/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'The Verge AI': {
            'url': 'https://www.theverge.com/ai-artificial-intelligence/rss/index.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'VentureBeat AI': {
            'url': 'https://venturebeat.com/category/ai/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'MIT Technology Review AI': {
            'url': 'https://www.technologyreview.com/topic/artificial-intelligence/feed',
            'type': 'rss',
            'category': 'ai'
        },
        'Ars Technica AI': {
            'url': 'https://feeds.arstechnica.com/arstechnica/technology-lab',
            'type': 'rss',
            'category': 'tech'
        },
        'Wired AI': {
            'url': 'https://www.wired.com/feed/tag/ai/latest/rss',
            'type': 'rss',
            'category': 'ai'
        },

        # Big-tech / AI-lab company blogs
        'OpenAI Blog': {
            'url': 'https://openai.com/blog/rss.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'Google AI Blog': {
            'url': 'https://blog.google/technology/ai/rss/',
            'type': 'rss',
            'category': 'ai'
        },
        'Microsoft AI Blog': {
            'url': 'https://blogs.microsoft.com/ai/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'Meta AI Blog': {
            'url': 'https://ai.meta.com/blog/rss/',
            'type': 'rss',
            'category': 'ai'
        },
        'DeepMind Blog': {
            'url': 'https://deepmind.google/blog/rss.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'Anthropic News': {
            'url': 'https://www.anthropic.com/news/rss.xml',
            'type': 'rss',
            'category': 'ai'
        },
        'AWS AI Blog': {
            'url': 'https://aws.amazon.com/blogs/machine-learning/feed/',
            'type': 'rss',
            'category': 'ai'
        },
        'NVIDIA AI Blog': {
            'url': 'https://blogs.nvidia.com/feed/',
            'type': 'rss',
            'category': 'ai'
        },

        # Academic research groups
        'Stanford HAI': {
            'url': 'https://hai.stanford.edu/news/rss.xml',
            'type': 'rss',
            'category': 'research'
        },
        'Berkeley AI Research': {
            'url': 'https://bair.berkeley.edu/blog/feed.xml',
            'type': 'rss',
            'category': 'research'
        },
    }

    def __init__(self):
        """Initialize the AI/Tech news scraper."""
        self.session = requests.Session()
        # A browser-like User-Agent: several of the feeds above return 403
        # to requests carrying a default library client identifier.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        })

    @staticmethod
    def _utcnow_naive() -> datetime:
        """Current UTC time as a naive datetime.

        feedparser's ``*_parsed`` fields are UTC struct_times, so all
        comparisons must be done in UTC.  The result is kept naive for
        backward compatibility with callers that handle the 'timestamp'
        values as naive datetimes.
        """
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def scrape_ai_tech_news(self, max_items: int = 100, hours: int = 48) -> List[Dict]:
        """
        Scrape AI and tech news from all sources.

        Args:
            max_items: Maximum number of news items to return
            hours: Only include news from the last N hours

        Returns:
            List of news items with standardized format, newest first
        """
        all_news: List[Dict] = []
        # BUGFIX: cutoff was previously computed from local time while the
        # feed timestamps are UTC, skewing the window by the local offset.
        cutoff_time = self._utcnow_naive() - timedelta(hours=hours)

        for source_name, source_config in self.SOURCES.items():
            try:
                if source_config['type'] == 'rss':
                    news_items = self._scrape_rss_feed(
                        source_name,
                        source_config['url'],
                        source_config['category'],
                        cutoff_time
                    )
                    all_news.extend(news_items)
                    logger.info("Scraped %d items from %s", len(news_items), source_name)
            except Exception as e:
                # A single failing source must not abort the whole scrape.
                logger.error("Error scraping %s: %s", source_name, e)
                continue

        # Newest first, then trim to the requested number of items.
        all_news.sort(key=lambda item: item['timestamp'], reverse=True)
        return all_news[:max_items]

    def _scrape_rss_feed(self, source_name: str, feed_url: str,
                         category: str, cutoff_time: datetime) -> List[Dict]:
        """Fetch and normalize a single RSS feed.

        Args:
            source_name: Display name used in logs and the 'source' field
            feed_url: RSS/Atom feed URL
            category: Category tag copied into each item
            cutoff_time: Naive-UTC lower bound; older entries are skipped

        Returns:
            List of normalized item dicts; empty on fetch/parse failure.
        """
        news_items: List[Dict] = []

        try:
            # Fetch through the shared session so the configured User-Agent
            # and a timeout apply.  Previously feedparser.parse(url) fetched
            # directly — ignoring the session and with no timeout, so a slow
            # feed could hang the scraper indefinitely.
            response = self.session.get(feed_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            feed = feedparser.parse(response.content)

            for entry in feed.entries:
                try:
                    # Prefer the published time, fall back to updated,
                    # then to "now" (naive UTC) if the feed gives neither.
                    if getattr(entry, 'published_parsed', None):
                        timestamp = datetime(*entry.published_parsed[:6])
                    elif getattr(entry, 'updated_parsed', None):
                        timestamp = datetime(*entry.updated_parsed[:6])
                    else:
                        timestamp = self._utcnow_naive()

                    # Skip entries older than the requested window.
                    if timestamp < cutoff_time:
                        continue

                    title = entry.get('title', 'No title')
                    summary = entry.get('summary', entry.get('description', ''))

                    # Strip embedded HTML and cap the summary at 300 chars.
                    if summary:
                        summary = BeautifulSoup(summary, 'html.parser').get_text().strip()
                        if len(summary) > 300:
                            summary = summary[:297] + '...'

                    news_items.append({
                        'title': title,
                        'summary': summary or title,
                        'source': source_name,
                        'url': entry.get('link', ''),
                        'timestamp': timestamp,
                        'category': category,
                        'impact': self._determine_impact(title, summary),
                        'sentiment': self._determine_sentiment(title, summary),
                        'is_breaking': self._is_breaking_news(title, summary),
                        # Social metrics are populated elsewhere; default to 0.
                        'likes': 0,
                        'retweets': 0,
                        'reddit_score': 0,
                        'reddit_comments': 0
                    })
                except Exception as e:
                    logger.error("Error parsing entry from %s: %s", source_name, e)
                    continue

        except Exception as e:
            logger.error("Error fetching RSS feed %s: %s", feed_url, e)

        return news_items

    def _determine_impact(self, title: str, summary: str) -> str:
        """Classify impact as 'high' / 'medium' / 'low' via keyword match."""
        text = f"{title} {summary}".lower()

        high_impact_keywords = [
            'breakthrough', 'announce', 'launch', 'release', 'new model',
            'gpt', 'claude', 'gemini', 'llama', 'chatgpt',
            'billion', 'trillion', 'acquisition', 'merger',
            'regulation', 'ban', 'lawsuit', 'security breach',
            'major', 'significant', 'revolutionary', 'first-ever'
        ]

        medium_impact_keywords = [
            'update', 'improve', 'enhance', 'study', 'research',
            'partnership', 'collaboration', 'funding', 'investment',
            'expands', 'grows', 'adopts', 'implements'
        ]

        # High-impact keywords win over medium ones; default is 'low'.
        if any(keyword in text for keyword in high_impact_keywords):
            return 'high'
        if any(keyword in text for keyword in medium_impact_keywords):
            return 'medium'
        return 'low'

    def _determine_sentiment(self, title: str, summary: str) -> str:
        """Classify sentiment by comparing positive vs negative keyword hits."""
        text = f"{title} {summary}".lower()

        positive_keywords = [
            'breakthrough', 'success', 'achieve', 'improve', 'advance',
            'innovative', 'revolutionary', 'launch', 'release', 'win',
            'growth', 'expand', 'partnership', 'collaboration'
        ]

        negative_keywords = [
            'fail', 'issue', 'problem', 'concern', 'worry', 'risk',
            'ban', 'lawsuit', 'breach', 'hack', 'leak', 'crisis',
            'decline', 'loss', 'shutdown', 'controversy'
        ]

        positive_count = sum(1 for kw in positive_keywords if kw in text)
        negative_count = sum(1 for kw in negative_keywords if kw in text)

        # Ties (including zero hits) are neutral.
        if positive_count > negative_count:
            return 'positive'
        if negative_count > positive_count:
            return 'negative'
        return 'neutral'

    def _is_breaking_news(self, title: str, summary: str) -> bool:
        """Return True if the text contains a breaking-news indicator."""
        text = f"{title} {summary}".lower()

        breaking_indicators = [
            'breaking', 'just announced', 'just released', 'just launched',
            'alert', 'urgent', 'developing', 'live', 'now:'
        ]

        return any(indicator in text for indicator in breaking_indicators)

    def get_statistics(self) -> Dict:
        """Get statistics - returns empty for backward compatibility."""
        return {
            'total': 0,
            'high_impact': 0,
            'breaking': 0,
            'last_update': 'Managed by cache',
            'by_category': {
                'ai': 0,
                'tech': 0,
                'research': 0
            }
        }
|
|