| """ |
| Example 2: Data Ingestion - PDF and Web Scraping |
| |
| This example demonstrates: |
| - PDF document reading and processing |
| - Web article extraction |
| - News aggregation |
| - Intelligence extraction from documents |
| """ |
|
|
| import sys |
| sys.path.append('..') |
|
|
| from geobot.data_ingestion.pdf_reader import PDFReader, PDFProcessor |
| from geobot.data_ingestion.web_scraper import WebScraper, ArticleExtractor, NewsAggregator |
|
|
|
|
def demo_pdf_processing():
    """Print an overview of PDFProcessor's features with example usage.

    Instantiates a PDFProcessor but performs no actual file I/O; the
    demonstration is purely textual.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("PDF Processing Demo")
    print(rule)

    processor = PDFProcessor()

    # Feature list kept as data so the printout stays easy to extend.
    capabilities = (
        "Text extraction from PDFs",
        "Table extraction",
        "Metadata extraction",
        "Entity recognition (countries, organizations)",
        "Keyword extraction",
        "Risk assessment",
    )
    print("\nPDF processing capabilities:")
    for capability in capabilities:
        print(f"- {capability}")
    print("\nTo use: processor.process_document('path/to/document.pdf')")

    # Shown to the user verbatim — this is documentation, not executed code.
    example_code = """
    # Process a single PDF
    result = processor.process_document('intelligence_report.pdf')

    print(f"Title: {result['metadata'].get('title', 'Unknown')}")
    print(f"Pages: {result['num_pages']}")
    print(f"Keywords: {result['keywords']}")
    print(f"Risk Level: {result['intelligence']['risk_level']}")

    # Process multiple PDFs
    results = processor.batch_process('reports_directory/', '*.pdf')
    """

    print("\nExample usage:")
    print(example_code)
|
|
|
|
def demo_web_scraping():
    """Print an overview of ArticleExtractor's features with example usage.

    Instantiates an ArticleExtractor but makes no network requests; the
    demonstration is purely textual.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("Web Scraping Demo")
    print(rule)

    extractor = ArticleExtractor()

    # Feature list kept as data so the printout stays easy to extend.
    features = (
        "Extract articles from URLs",
        "Clean HTML content",
        "Extract metadata (author, date, etc.)",
        "Multiple extraction methods (newspaper3k, trafilatura, BeautifulSoup)",
    )
    print("\nWeb scraping capabilities:")
    for feature in features:
        print(f"- {feature}")

    example_url = "https://www.example.com/geopolitical-analysis"
    print(f"\nExample: Extracting article from {example_url}")
    print("(This is a demonstration - no actual web request is made)")

    # Shown to the user verbatim — this is documentation, not executed code.
    example_code = """
    # Extract article
    article = extractor.extract_article(url)

    print(f"Title: {article['title']}")
    print(f"Author: {article['authors']}")
    print(f"Published: {article['publish_date']}")
    print(f"Content length: {len(article['text'])} characters")

    # Extract multiple articles
    urls = ['url1', 'url2', 'url3']
    articles = extractor.batch_extract(urls)
    """

    print("\nExample usage:")
    print(example_code)
|
|
|
|
def demo_news_aggregation():
    """Print an overview of NewsAggregator's features with example usage.

    Instantiates a NewsAggregator but fetches nothing; the demonstration
    is purely textual.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("News Aggregation Demo")
    print(rule)

    aggregator = NewsAggregator()

    # Feature list kept as data so the printout stays easy to extend.
    features = (
        "Aggregate from multiple sources",
        "RSS feed support",
        "Keyword filtering",
        "Trending topic detection",
        "Real-time monitoring",
    )
    print("\nNews aggregation capabilities:")
    for feature in features:
        print(f"- {feature}")

    print("\nExample: Setting up news aggregation")

    # Shown to the user verbatim — this is documentation, not executed code.
    example_code = """
    # Add news sources
    aggregator.add_source(
        name='Reuters',
        url='https://www.reuters.com/news/world',
        source_type='rss'
    )

    aggregator.add_source(
        name='Al Jazeera',
        url='https://www.aljazeera.com/xml/rss/all.xml',
        source_type='rss'
    )

    # Fetch news with keywords
    keywords = ['sanctions', 'conflict', 'diplomacy', 'military']
    articles = aggregator.fetch_news(keywords)

    print(f"Found {len(articles)} relevant articles")

    # Get trending topics
    topics = aggregator.get_trending_topics(articles, n_topics=10)
    print("Trending topics:", topics)

    # Monitor sources continuously
    def alert_callback(new_articles):
        print(f"ALERT: {len(new_articles)} new relevant articles found")
        for article in new_articles:
            print(f"  - {article['title']}")

    # Monitor every hour
    aggregator.monitor_sources(keywords, callback=alert_callback, interval=3600)
    """

    print(example_code)
|
|
|
|
def demo_intelligence_extraction():
    """Print an overview of document intelligence extraction with example usage.

    Purely textual: no processor is instantiated and no document is read.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("Intelligence Extraction Demo")
    print(rule)

    # Feature list kept as data so the printout stays easy to extend.
    features = (
        "Country and organization detection",
        "Conflict indicator detection",
        "Risk level assessment",
        "Document classification",
        "Key phrase extraction",
    )
    print("\nIntelligence extraction capabilities:")
    for feature in features:
        print(f"- {feature}")

    # Shown to the user verbatim — this is documentation, not executed code.
    example_code = """
    processor = PDFProcessor()

    # Extract intelligence from PDF
    intel = processor.extract_intelligence('report.pdf')

    print("Intelligence Summary:")
    print(f"Risk Level: {intel['intelligence']['risk_level']}")
    print(f"Countries mentioned: {intel['intelligence']['mentioned_countries']}")
    print(f"Conflict indicators: {intel['intelligence']['conflict_indicators']}")
    print(f"Key topics: {intel['intelligence']['key_topics']}")
    print(f"Document type: {intel['intelligence']['document_type']}")
    """

    print("\nExample usage:")
    print(example_code)
|
|
|
|
def main():
    """Run every data-ingestion demo in sequence and print a closing note."""
    rule = "=" * 80
    print(rule)
    print("GeoBotv1 - Data Ingestion Examples")
    print(rule)
    print("\nThis module demonstrates the data ingestion capabilities of GeoBotv1:")

    # Numbered agenda generated from data so ordering stays consistent
    # with the demo calls below.
    topics = (
        "PDF document processing",
        "Web scraping and article extraction",
        "News aggregation from multiple sources",
        "Intelligence extraction from documents",
    )
    for number, topic in enumerate(topics, start=1):
        print(f"{number}. {topic}")

    demo_pdf_processing()
    demo_web_scraping()
    demo_news_aggregation()
    demo_intelligence_extraction()

    print(f"\n{rule}")
    print("Data Ingestion Demo Complete")
    print(rule)
    print("\nNote: Install required packages for full functionality:")
    print("  pip install pypdf pdfplumber beautifulsoup4 newspaper3k trafilatura")


if __name__ == "__main__":
    main()
|
|