| import requests |
| from bs4 import BeautifulSoup |
|
|
# CSS selectors tried in order when looking for the product title; the bare
# ``h1`` is a last resort before falling back to the document <title>.
_TITLE_SELECTORS = (
    'h1[data-testid="product-title"]',
    '.product-title',
    '.product-name',
    'h1.product_title',
    '.pdp-product-name',
    '[data-automation-id="product-title"]',
    'h1',
)

# CSS selectors whose matches are collected as description text.
_DESCRIPTION_SELECTORS = (
    '.product-description',
    '.product-details',
    '[data-testid="product-description"]',
    '.product-summary',
    '.pdp-product-description-content',
    '.feature-bullets ul',
    '.a-unordered-list.a-vertical',
)

# CSS selectors whose matches are collected as specification text.
_SPEC_SELECTORS = (
    '.product-specifications',
    '.tech-specs',
    '.product-details-table',
    '.specification-table',
    '[data-testid="specifications"]',
)

_FALLBACK_MESSAGE = (
    "Unable to extract product information from the provided URL. "
    "Please enter product description manually."
)


def _first_text(soup, selectors):
    """Return the first non-empty text matched by *selectors*, else None."""
    for selector in selectors:
        node = soup.select_one(selector)
        if node is None:
            continue
        # A space separator keeps text from adjacent child nodes from being
        # glued together.
        text = node.get_text(" ", strip=True)
        if text:
            return text
    return None


def _collect_texts(soup, selectors, min_length=0):
    """Return unique texts longer than *min_length* matched by *selectors*."""
    seen = set()
    texts = []
    for selector in selectors:
        for node in soup.select(selector):
            text = node.get_text(" ", strip=True)
            # Several selectors can match the same node; keep each text once.
            if len(text) > min_length and text not in seen:
                seen.add(text)
                texts.append(text)
    return texts


def scrape_product_info(url: str) -> str:
    """Scrape product information from a given URL.

    Downloads the page, then extracts a title, up to three description
    fragments and up to two specification fragments using a fixed list of
    CSS selectors. The pieces are combined into one plain-text summary.

    Args:
        url: Address of the product page to fetch.

    Returns:
        A summary of at most 2000 characters starting with
        ``"Product: <title>"``, optionally followed by ``"Description: ..."``
        and ``"Specifications: ..."`` sections. If anything goes wrong
        (request failure, HTTP error status, parsing error), a fixed
        human-readable fallback message is returned instead; this function
        does not raise.
    """
    # Function-scope import so the block does not depend on a file-level
    # ``import logging``.
    import logging

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        title = _first_text(soup, _TITLE_SELECTORS)
        if not title:
            # No usable heading: fall back to the document <title>, and to a
            # generic label when that is missing or empty as well.
            title_tag = soup.find('title')
            title = (
                title_tag.get_text(" ", strip=True) if title_tag else ""
            ) or "Product"

        # Fragments of 20 characters or fewer are treated as noise.
        description_parts = _collect_texts(
            soup, _DESCRIPTION_SELECTORS, min_length=20
        )
        specs = _collect_texts(soup, _SPEC_SELECTORS)

        full_description = f"Product: {title}\n\n"

        if description_parts:
            full_description += (
                "Description: " + " ".join(description_parts[:3]) + "\n\n"
            )

        if specs:
            full_description += "Specifications: " + " ".join(specs[:2])

        return full_description[:2000]

    except Exception:
        # Deliberate best-effort boundary: callers rely on always getting a
        # string back, so keep the broad catch but record why it failed.
        logging.getLogger(__name__).warning(
            "Failed to scrape product info from %r", url, exc_info=True
        )
        return _FALLBACK_MESSAGE
|
|