Spaces:

Agents-MCP-Hackathon
/

Environmental-Impact-Analyzer

Sleeping

File size: 3,298 Bytes

d1245ca

import logging

import requests
from bs4 import BeautifulSoup

# CSS selectors tried, in order, when looking for the product title.
_TITLE_SELECTORS = (
    'h1[data-testid="product-title"]',  # Amazon
    '.product-title',
    '.product-name',
    'h1.product_title',
    '.pdp-product-name',  # Flipkart
    '[data-automation-id="product-title"]',
    'h1',
)

# CSS selectors whose matches are collected as description text.
_DESCRIPTION_SELECTORS = (
    '.product-description',
    '.product-details',
    '[data-testid="product-description"]',
    '.product-summary',
    '.pdp-product-description-content',
    '.feature-bullets ul',
    '.a-unordered-list.a-vertical',
)

# CSS selectors whose matches are collected as specification text.
_SPEC_SELECTORS = (
    '.product-specifications',
    '.tech-specs',
    '.product-details-table',
    '.specification-table',
    '[data-testid="specifications"]',
)

_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
)
_REQUEST_TIMEOUT_SECONDS = 10
_MIN_DESCRIPTION_LENGTH = 21  # descriptions must be longer than 20 characters
_MAX_DESCRIPTION_BLOCKS = 3
_MAX_SPEC_BLOCKS = 2
_MAX_OUTPUT_LENGTH = 2000
_FALLBACK_MESSAGE = (
    "Unable to extract product information from the provided URL. "
    "Please enter product description manually."
)


def _element_text(element) -> str:
    """Return the element's text with nested fragments separated by one space."""
    # Without an explicit separator, text from sibling tags is glued together
    # (e.g. two <li> items "A" and "B" would come out as "AB").
    return element.get_text(" ", strip=True)


def _first_text(soup, selectors) -> str:
    """Return the text of the first selector match that is not blank, else ''."""
    for selector in selectors:
        element = soup.select_one(selector)
        if element is None:
            continue
        text = _element_text(element)
        if text:
            return text
    return ""


def _collect_texts(soup, selectors, min_length=1) -> list:
    """Return unique texts of all selector matches that have at least min_length characters."""
    seen = set()
    texts = []
    for selector in selectors:
        for element in soup.select(selector):
            text = _element_text(element)
            # Several selectors can hit the same element; keep each text once
            # so duplicates do not eat into the output length limit.
            if len(text) < min_length or text in seen:
                continue
            seen.add(text)
            texts.append(text)
    return texts


def scrape_product_info(url: str) -> str:
    """
    Scrape product information from a given URL.

    The page is fetched with a browser-like User-Agent and parsed with
    BeautifulSoup. A title, up to three description blocks and up to two
    specification blocks are looked up through lists of CSS selectors and
    combined into one text.

    Args:
        url: Address of the product page to fetch.

    Returns:
        A string of the form "Product: ...", optionally followed by
        "Description: ..." and "Specifications: ..." sections, truncated to
        2000 characters. If anything goes wrong (request failure, HTTP error
        status, parsing error), a fixed message asking the user to enter the
        description manually is returned instead; no exception is raised.
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': _USER_AGENT},
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        title = _first_text(soup, _TITLE_SELECTORS)
        if not title:
            # Fall back to the page <title>, then to a generic placeholder.
            title_tag = soup.find('title')
            page_title = _element_text(title_tag) if title_tag is not None else ""
            title = page_title or "Product"

        descriptions = _collect_texts(
            soup, _DESCRIPTION_SELECTORS, min_length=_MIN_DESCRIPTION_LENGTH
        )
        specs = _collect_texts(soup, _SPEC_SELECTORS)

        sections = [f"Product: {title}\n\n"]
        if descriptions:
            sections.append(
                "Description: "
                + " ".join(descriptions[:_MAX_DESCRIPTION_BLOCKS])
                + "\n\n"
            )
        if specs:
            sections.append(
                "Specifications: " + " ".join(specs[:_MAX_SPEC_BLOCKS])
            )

        return "".join(sections)[:_MAX_OUTPUT_LENGTH]

    except Exception as exc:
        # Deliberate best-effort boundary: callers always get a string back.
        # The failure is logged so it is not lost silently.
        logging.getLogger(__name__).warning(
            "Could not scrape product information from %r: %s", url, exc
        )
        return _FALLBACK_MESSAGE