Kunal committed on
Commit
d1245ca
·
1 Parent(s): 0929a94

rewritten extract_product_from_url function to use scrapper.py

Browse files
Files changed (2) hide show
  1. Agent.py +40 -36
  2. scrapper.py +95 -0
Agent.py CHANGED
@@ -11,8 +11,8 @@ from huggingface_hub import InferenceClient
11
  from langchain.prompts import ChatPromptTemplate
12
  from langgraph.graph import StateGraph, START, END
13
 
14
- # from unity_functions import parse_extraction_result, parse_scoring_result
15
- # from scrapper import scrape_product_info
16
 
17
  # Load environment variables from .env file
18
  load_dotenv()
@@ -40,46 +40,50 @@ def extract_product_from_url(state: EnvironmentalAnalysisState):
40
  """
41
  Extract product information from URL and update the product description
42
  """
43
- extraction_prompt = ChatPromptTemplate.from_template("""
44
- You are an expert environmental analyst. You are given a URL to a product page. Your job is to extract environmental impact data by simulating access to the product's specifications and making reasonable assumptions if exact data is missing.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- Product URL: {product_description}
 
 
 
 
 
47
 
48
- Carefully extract or infer the following in JSON format:
49
- {{
50
- "material_composition": "main materials used (e.g., plastic, metal, organic cotton)",
51
- "manufacturing_location": "country or region of manufacturing",
52
- "product_weight": numeric_value_in_kg,
53
- "transport_distance": numeric_value_in_km (assumed from manufacturing to India),
54
- "packaging_type": "type and material of packaging (e.g., cardboard box, plastic wrap)",
55
- "energy_usage": numeric_value_in_kwh (if applicable or estimated),
56
- "recyclability": "recyclable / biodegradable / non-recyclable",
57
- "durability": "expected lifespan in years",
58
- "certifications": "environmental or sustainability certifications (e.g., Energy Star, Fair Trade)"
59
- }}
60
 
61
- If some data is not explicitly available on the page, infer based on similar products or industry standards.
62
- Use domain knowledge and mention estimates clearly.
63
- """)
64
 
65
- messages = [
66
- {
67
- "role": "user",
68
- "content": extraction_prompt
69
  }
70
- ]
71
 
72
- result = llm.chat.completions.create(
73
- model="deepseek-ai/DeepSeek-R1", # or another suitable model available on Hyperbolic
74
- messages=messages,
75
- max_tokens=1000,
76
- temperature=0.2
77
- )
78
- extracted_data = parse_extraction_result(result.choices[0].message.content)
79
- return {
80
- "extracted_data": extracted_data,
81
- "messages": [result.choices[0].message]
82
- }
83
 
84
  def extract_product_info(state: EnvironmentalAnalysisState):
85
  extraction_prompt = ChatPromptTemplate.from_template("""
 
11
  from langchain.prompts import ChatPromptTemplate
12
  from langgraph.graph import StateGraph, START, END
13
 
14
+ from unity_functions import parse_extraction_result, parse_scoring_result
15
+ from scrapper import scrape_product_info
16
 
17
  # Load environment variables from .env file
18
  load_dotenv()
 
40
  """
41
  Extract product information from URL and update the product description
42
  """
43
+ if state.get("product_description"):
44
+ scraped_info = scrape_product_info(state["product_description"])
45
+ extraction_prompt = ChatPromptTemplate.from_template("""
46
+ Analyze the following scraped product information and extract environmental impact factors.
47
+
48
+ Product: {scraped_info}
49
+
50
+ Please provide the information in the following JSON format:
51
+ {{
52
+ "material_composition": "main materials used",
53
+ "manufacturing_location": "country or region",
54
+ "product_weight": numeric_value_in_kg,
55
+ "transport_distance": numeric_value_in_km,
56
+ "packaging_type": "packaging materials and type",
57
+ "energy_usage": numeric_value_in_kwh,
58
+ "recyclability": "recyclable/biodegradable/non-recyclable",
59
+ "durability": "estimated lifespan",
60
+ "certifications": "environmental certifications if any"
61
+ }}
62
+
63
+ If specific values are not mentioned, make reasonable estimates based on typical products of this type.
64
+ """)
65
 
66
+ messages = [
67
+ {
68
+ "role": "user",
69
+ "content": extraction_prompt
70
+ }
71
+ ]
72
 
73
+ result = llm.chat.completions.create(
74
+ model="deepseek-ai/DeepSeek-R1",
75
+ messages=messages,
76
+ max_tokens=1000,
77
+ temperature=0.2
78
+ )
 
 
 
 
 
 
79
 
80
+ extracted_data = parse_extraction_result(result.choices[0].message.content)
 
 
81
 
82
+ return {
83
+ "extracted_data": extracted_data,
84
+ "messages": [result.choices[0].message]
 
85
  }
 
86
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  def extract_product_info(state: EnvironmentalAnalysisState):
89
  extraction_prompt = ChatPromptTemplate.from_template("""
scrapper.py CHANGED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
def scrape_product_info(url: str) -> str:
    """
    Scrape product information from a given URL.

    Fetches the page and searches it for a title, description text and
    specification text using CSS selectors that are common on e-commerce
    sites, then combines whatever was found into one plain-text summary.

    Args:
        url: Address of the product page to fetch.

    Returns:
        A summary starting with "Product: <title>", optionally followed by
        "Description: ..." (at most three fragments) and
        "Specifications: ..." (at most two fragments), truncated to 2000
        characters. If anything goes wrong (network failure, HTTP error
        status, parsing problem), a fixed message asking the user to enter
        the description manually is returned instead; this function never
        raises.
    """
    # Local import so the block does not depend on a file-level import.
    import logging

    fallback_message = (
        "Unable to extract product information from the provided URL. "
        "Please enter product description manually."
    )
    max_length = 2000

    title_selectors = [
        'h1[data-testid="product-title"]',  # Amazon
        '.product-title',
        '.product-name',
        'h1.product_title',
        '.pdp-product-name',  # Flipkart
        '[data-automation-id="product-title"]',
        'h1',
    ]
    description_selectors = [
        '.product-description',
        '.product-details',
        '[data-testid="product-description"]',
        '.product-summary',
        '.pdp-product-description-content',
        '.feature-bullets ul',
        '.a-unordered-list.a-vertical',
    ]
    spec_selectors = [
        '.product-specifications',
        '.tech-specs',
        '.product-details-table',
        '.specification-table',
        '[data-testid="specifications"]',
    ]

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        def collect_text(selectors, min_length=0):
            """Return the distinct texts of all matching elements, in order."""
            found = []
            for selector in selectors:
                for element in soup.select(selector):
                    # A separator keeps adjacent list items / table cells
                    # from being glued together into one word.
                    text = element.get_text(" ", strip=True)
                    # One element can match several selectors; keep it once.
                    if len(text) > min_length and text not in found:
                        found.append(text)
            return found

        # Use the first selector that yields non-empty text, so an empty
        # placeholder element does not end the search early.
        title = None
        for selector in title_selectors:
            element = soup.select_one(selector)
            if element:
                candidate = element.get_text(" ", strip=True)
                if candidate:
                    title = candidate
                    break

        if not title:
            # Fallback to page title
            title_tag = soup.find('title')
            title = title_tag.get_text(strip=True) if title_tag else "Product"

        # Fragments of 20 characters or fewer are treated as noise.
        description_parts = collect_text(description_selectors, min_length=20)
        specs = collect_text(spec_selectors)

        # Combine all information
        full_description = f"Product: {title}\n\n"

        if description_parts:
            full_description += "Description: " + " ".join(description_parts[:3]) + "\n\n"

        if specs:
            full_description += "Specifications: " + " ".join(specs[:2])

        return full_description[:max_length]  # Limit length

    except Exception as exc:
        # Deliberate best-effort boundary: callers always get a string back.
        # Record the cause instead of discarding it so failures can be traced.
        logging.getLogger(__name__).warning(
            "Failed to scrape product information from %r: %s", url, exc
        )
        return fallback_message