Kunal
rewritten extrace_info_from_url function to use scrapper.py
d1245ca
import logging

import requests
from bs4 import BeautifulSoup
# CSS selectors tried, in order, when looking for the product title.
_TITLE_SELECTORS = (
    'h1[data-testid="product-title"]',  # Amazon
    '.product-title',
    '.product-name',
    'h1.product_title',
    '.pdp-product-name',  # Flipkart
    '[data-automation-id="product-title"]',
    'h1',
)

# CSS selectors whose matches are collected as description text.
_DESCRIPTION_SELECTORS = (
    '.product-description',
    '.product-details',
    '[data-testid="product-description"]',
    '.product-summary',
    '.pdp-product-description-content',
    '.feature-bullets ul',
    '.a-unordered-list.a-vertical',
)

# CSS selectors whose matches are collected as specification text.
_SPEC_SELECTORS = (
    '.product-specifications',
    '.tech-specs',
    '.product-details-table',
    '.specification-table',
    '[data-testid="specifications"]',
)

_REQUEST_TIMEOUT_SECONDS = 10
_MIN_DESCRIPTION_CHARS = 20  # shorter matches are treated as noise
_MAX_DESCRIPTION_PARTS = 3
_MAX_SPEC_PARTS = 2
_MAX_OUTPUT_CHARS = 2000

_FALLBACK_MESSAGE = (
    "Unable to extract product information from the provided URL. "
    "Please enter product description manually."
)


def _element_text(element) -> str:
    """Return the element's text with its fragments separated by one space."""
    # Without an explicit separator, text from adjacent child nodes (list
    # items, nested spans) is glued together with nothing in between.
    return element.get_text(" ", strip=True)


def _first_text(soup, selectors) -> str:
    """Return the text of the first selector match that is non-empty, else ""."""
    for selector in selectors:
        element = soup.select_one(selector)
        if element is None:
            continue
        text = _element_text(element)
        if text:
            return text
    return ""


def _collect_texts(soup, selectors, min_length: int = 0) -> list:
    """Return unique texts longer than *min_length* from all selector matches."""
    seen = set()
    texts = []
    for selector in selectors:
        for element in soup.select(selector):
            text = _element_text(element)
            # Several selectors can match the same element; keep each text once.
            if len(text) > min_length and text not in seen:
                seen.add(text)
                texts.append(text)
    return texts


def scrape_product_info(url: str) -> str:
    """Scrape product information from a given URL.

    Fetches the page, then looks for a title, description blocks and
    specification blocks using a fixed set of CSS selectors.

    Args:
        url: Address of the product page to fetch.

    Returns:
        A text of the form ``"Product: <title>\\n\\n"`` optionally followed by
        ``"Description: ..."`` and ``"Specifications: ..."`` sections,
        truncated to 2000 characters. If anything fails (request error, bad
        HTTP status, parsing problem), a fixed message asking the user to
        enter the description manually is returned instead.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(
            url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        title = _first_text(soup, _TITLE_SELECTORS)
        if not title:
            # Fall back to the page <title>, then to a generic placeholder.
            title_tag = soup.find('title')
            if title_tag is not None:
                title = title_tag.get_text(strip=True)
        title = title or "Product"

        description_parts = _collect_texts(
            soup, _DESCRIPTION_SELECTORS, min_length=_MIN_DESCRIPTION_CHARS
        )
        specs = _collect_texts(soup, _SPEC_SELECTORS)

        sections = [f"Product: {title}\n\n"]
        if description_parts:
            joined = " ".join(description_parts[:_MAX_DESCRIPTION_PARTS])
            sections.append("Description: " + joined + "\n\n")
        if specs:
            sections.append(
                "Specifications: " + " ".join(specs[:_MAX_SPEC_PARTS])
            )
        return "".join(sections)[:_MAX_OUTPUT_CHARS]
    except Exception as exc:
        # Deliberate best-effort boundary: any failure degrades to a message
        # asking for manual input, but the cause is logged rather than lost.
        logging.getLogger(__name__).warning(
            "Failed to scrape product info from %r: %s", url, exc
        )
        return _FALLBACK_MESSAGE