Spaces:

Agents-MCP-Hackathon
/

Environmental-Impact-Analyzer

Sleeping

App Files Files Community

Kunal committed on Jun 6, 2025

Commit

d1245ca

1 Parent(s): 0929a94

Rewrote the extract_product_from_url function to use scrapper.py

Browse files

Files changed (2) hide show

Agent.py +40 -36
scrapper.py +95 -0

Agent.py CHANGED Viewed

@@ -11,8 +11,8 @@ from huggingface_hub import InferenceClient
 from langchain.prompts import ChatPromptTemplate
 from langgraph.graph import StateGraph, START, END
-# from unity_functions import parse_extraction_result, parse_scoring_result
-# from scrapper import scrape_product_info
 # Load environment variables from .env file
 load_dotenv()
@@ -40,46 +40,50 @@ def extract_product_from_url(state: EnvironmentalAnalysisState):
     """
     Extract product information from URL and update the product description
     """
-    extraction_prompt = ChatPromptTemplate.from_template("""
-    You are an expert environmental analyst. You are given a URL to a product page. Your job is to extract environmental impact data by simulating access to the product's specifications and making reasonable assumptions if exact data is missing.
-    Product URL: {product_description}
-    Carefully extract or infer the following in JSON format:
-    {{
-        "material_composition": "main materials used (e.g., plastic, metal, organic cotton)",
-        "manufacturing_location": "country or region of manufacturing",
-        "product_weight": numeric_value_in_kg,
-        "transport_distance": numeric_value_in_km (assumed from manufacturing to India),
-        "packaging_type": "type and material of packaging (e.g., cardboard box, plastic wrap)",
-        "energy_usage": numeric_value_in_kwh (if applicable or estimated),
-        "recyclability": "recyclable / biodegradable / non-recyclable",
-        "durability": "expected lifespan in years",
-        "certifications": "environmental or sustainability certifications (e.g., Energy Star, Fair Trade)"
-    }}
-    If some data is not explicitly available on the page, infer based on similar products or industry standards.
-    Use domain knowledge and mention estimates clearly.
-    """)
-    messages = [
-        {
-            "role": "user",
-            "content": extraction_prompt
         }
-    ]
-    result = llm.chat.completions.create(
-        model="deepseek-ai/DeepSeek-R1",  # or another suitable model available on Hyperbolic
-        messages=messages,
-        max_tokens=1000,
-        temperature=0.2
-    )
-    extracted_data = parse_extraction_result(result.choices[0].message.content)
-    return {
-        "extracted_data": extracted_data,
-        "messages": [result.choices[0].message]
-    }
 def extract_product_info(state: EnvironmentalAnalysisState):
     extraction_prompt = ChatPromptTemplate.from_template("""

 from langchain.prompts import ChatPromptTemplate
 from langgraph.graph import StateGraph, START, END
+from unity_functions import parse_extraction_result, parse_scoring_result
+from scrapper import scrape_product_info
 # Load environment variables from .env file
 load_dotenv()
     """
     Extract product information from URL and update the product description
     """
+    if state.get("product_description"):
+        scraped_info = scrape_product_info(state["product_description"])
+        extraction_prompt = ChatPromptTemplate.from_template("""
+        Analyze the following scraped product information and extract environmental impact factors.
+        Product: {scraped_info}
+        Please provide the information in the following JSON format:
+        {{
+            "material_composition": "main materials used",
+            "manufacturing_location": "country or region",
+            "product_weight": numeric_value_in_kg,
+            "transport_distance": numeric_value_in_km,
+            "packaging_type": "packaging materials and type",
+            "energy_usage": numeric_value_in_kwh,
+            "recyclability": "recyclable/biodegradable/non-recyclable",
+            "durability": "estimated lifespan",
+            "certifications": "environmental certifications if any"
+        }}
+        If specific values are not mentioned, make reasonable estimates based on typical products of this type.
+        """)
+        messages = [
+            {
+                "role": "user",
+                "content": extraction_prompt
+            }
+        ]
+        result = llm.chat.completions.create(
+            model="deepseek-ai/DeepSeek-R1",
+            messages=messages,
+            max_tokens=1000,
+            temperature=0.2
+        )
+        extracted_data = parse_extraction_result(result.choices[0].message.content)
+        return {
+            "extracted_data": extracted_data,
+            "messages": [result.choices[0].message]
         }
 def extract_product_info(state: EnvironmentalAnalysisState):
     extraction_prompt = ChatPromptTemplate.from_template("""

scrapper.py CHANGED Viewed

	@@ -0,0 +1,95 @@

+import requests
+from bs4 import BeautifulSoup
+def scrape_product_info(url: str) -> str:
+    """
+    Scrape product information from a given URL.
+    Fetches the page with a browser-like User-Agent, then looks for a title,
+    description blocks and specification blocks using CSS selectors that are
+    common on e-commerce pages.
+    Args:
+        url: Address of the product page to fetch.
+    Returns:
+        A text summary made of a "Product: ..." line, followed by optional
+        "Description: ..." and "Specifications: ..." sections, truncated to
+        2000 characters. If the page cannot be fetched or parsed, a fixed
+        fallback message asking for a manual description is returned instead.
+    """
+    import logging  # local import so this block does not depend on new file-level imports
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+        # Try to find product title, most specific selectors first
+        title_selectors = [
+            'h1[data-testid="product-title"]',  # Amazon
+            '.product-title',
+            '.product-name',
+            'h1.product_title',
+            '.pdp-product-name',  # Flipkart
+            '[data-automation-id="product-title"]',
+            'h1'
+        ]
+        title = None
+        for selector in title_selectors:
+            element = soup.select_one(selector)
+            # A " " separator keeps words from adjacent child tags apart
+            text = element.get_text(" ", strip=True) if element else ""
+            if text:
+                # Only stop on a non-empty match so an empty tag cannot hide a later one
+                title = text
+                break
+        if not title:
+            # Fallback to page title
+            title_tag = soup.find('title')
+            title = title_tag.get_text(strip=True) if title_tag else "Product"
+        # Try to find product description
+        description_selectors = [
+            '.product-description',
+            '.product-details',
+            '[data-testid="product-description"]',
+            '.product-summary',
+            '.pdp-product-description-content',
+            '.feature-bullets ul',
+            '.a-unordered-list.a-vertical'
+        ]
+        description_parts = []
+        for selector in description_selectors:
+            for element in soup.select(selector):
+                text = element.get_text(" ", strip=True)
+                # Filter out short/empty descriptions and text already
+                # collected through another selector
+                if len(text) > 20 and text not in description_parts:
+                    description_parts.append(text)
+        # Try to find specifications
+        spec_selectors = [
+            '.product-specifications',
+            '.tech-specs',
+            '.product-details-table',
+            '.specification-table',
+            '[data-testid="specifications"]'
+        ]
+        specs = []
+        for selector in spec_selectors:
+            for element in soup.select(selector):
+                text = element.get_text(" ", strip=True)
+                if text and text not in specs:
+                    specs.append(text)
+        # Combine all information; at most 3 description blocks and 2 spec blocks
+        sections = [f"Product: {title}\n\n"]
+        if description_parts:
+            sections.append("Description: " + " ".join(description_parts[:3]) + "\n\n")
+        if specs:
+            sections.append("Specifications: " + " ".join(specs[:2]))
+        return "".join(sections)[:2000]  # Limit length
+    except Exception:
+        # Best-effort boundary: any network or parsing failure is logged and
+        # turned into the fallback message rather than raised to the caller
+        logging.getLogger(__name__).exception("Error scraping product information from %s", url)
+        return "Unable to extract product information from the provided URL. Please enter product description manually."