ravimohan19 committed on
Commit
4e03699
Β·
verified Β·
1 Parent(s): 02dda50

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +440 -0
app.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Gradio UI for the Polymer Datasheet Crawler Agent.
Deployable as a HuggingFace Space.
"""

from __future__ import annotations

import json
import logging
import os  # NOTE(review): unused in this file's visible code — confirm before removing
import tempfile
from pathlib import Path  # NOTE(review): unused in this file's visible code — confirm before removing

import gradio as gr
import pandas as pd

# Project-local workflow entry points: the LangGraph graph, the SQLite-backed
# store (`db`), and the high-level run/search helpers the UI handlers call.
from graph import (
    build_graph,  # NOTE(review): imported but not referenced here — possibly kept for side effects; confirm
    db,
    run_search,
    run_upload,
    search_database,
    get_database_summary,
)
from pdf_extractor import extract_text_from_pdf
from models import DatasheetRecord

# Root logging config: timestamp | logger name | level | message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(name)s | %(levelname)s | %(message)s",
)
logger = logging.getLogger(__name__)
33
+
34
+
35
+ # ══════════════════════════════════════════════════════════════════════════════
36
+ # Handler Functions
37
+ # ══════════════════════════════════════════════════════════════════════════════
38
+
39
def handle_search(
    manufacturer: str,
    polymer_family: str,
    grade: str,
    progress=gr.Progress(),
) -> tuple[str, pd.DataFrame, str]:
    """
    Handle the 'Search & Add' tab: run the full LangGraph workflow
    to search, parse, and store a datasheet.

    Returns a (status message, properties table, raw JSON) triple for the UI.
    """
    # Guard: at least one of manufacturer / polymer family must be non-blank.
    if not (manufacturer.strip() or polymer_family.strip()):
        return (
            "⚠️ Please provide at least a manufacturer or polymer family.",
            pd.DataFrame(),
            "",
        )

    progress(0.1, desc="Initializing search...")
    try:
        progress(0.3, desc="Searching the web with Tavily...")
        outcome = run_search(
            manufacturer=manufacturer.strip(),
            polymer_family=polymer_family.strip(),
            grade=grade.strip(),
        )
        progress(0.9, desc="Done!")

        status = outcome.get("status", "unknown")
        message = outcome.get("message", "")

        # Default to empty outputs; filled in only when a record was parsed.
        table = pd.DataFrame()
        raw_json = ""

        parsed = outcome.get("parsed_datasheet")
        if parsed:
            # The workflow may hand back either a dict or a ready-made record.
            record = parsed if not isinstance(parsed, dict) else DatasheetRecord(**parsed)
            flat = record.to_flat_dict()
            # Keep only populated fields, dropping internal metadata columns.
            rows = [
                (key, val)
                for key, val in flat.items()
                if val and key not in ("id", "created_at")
            ]
            table = pd.DataFrame(rows, columns=["Property", "Value"])
            raw_json = json.dumps(flat, indent=2)

        icon = "✅" if status == "success" else "❌"
        return f"{icon} {message}", table, raw_json

    except Exception as exc:
        logger.exception("Search handler error")
        return f"❌ Error: {exc}", pd.DataFrame(), ""
94
+
95
+
96
def handle_upload(
    file_obj,
    progress=gr.Progress(),
) -> tuple[str, pd.DataFrame, str]:
    """
    Handle the 'Upload Datasheet' tab: extract text from PDF,
    then run the LangGraph workflow in upload mode.

    Returns a (status message, properties table, raw JSON) triple for the UI.
    """
    if file_obj is None:
        return "⚠️ Please upload a PDF file.", pd.DataFrame(), ""

    progress(0.1, desc="Reading PDF...")
    try:
        # Gradio hands over either a tempfile-like object or a plain path string.
        pdf_path = str(file_obj) if not hasattr(file_obj, "name") else file_obj.name
        text = extract_text_from_pdf(pdf_path)

        if not text.strip():
            return (
                "⚠️ Could not extract text from the PDF. "
                "It may be image-based (scanned). Try a text-based PDF.",
                pd.DataFrame(),
                "",
            )

        progress(0.4, desc="Parsing with LLM...")
        outcome = run_upload(uploaded_text=text)
        progress(0.9, desc="Done!")

        status = outcome.get("status", "unknown")
        message = outcome.get("message", "")

        # Default to empty outputs; filled in only when a record was parsed.
        table = pd.DataFrame()
        raw_json = ""

        parsed = outcome.get("parsed_datasheet")
        if parsed:
            # The workflow may hand back either a dict or a ready-made record.
            record = parsed if not isinstance(parsed, dict) else DatasheetRecord(**parsed)
            flat = record.to_flat_dict()
            # Keep only populated fields, dropping internal metadata columns.
            rows = [
                (key, val)
                for key, val in flat.items()
                if val and key not in ("id", "created_at")
            ]
            table = pd.DataFrame(rows, columns=["Property", "Value"])
            raw_json = json.dumps(flat, indent=2)

        icon = "✅" if status == "success" else "❌"
        return f"{icon} {message}", table, raw_json

    except Exception as exc:
        logger.exception("Upload handler error")
        return f"❌ Error: {exc}", pd.DataFrame(), ""
151
+
152
+
153
def handle_db_search(
    query: str,
    manufacturer: str,
    polymer_family: str,
) -> pd.DataFrame:
    """Search the database and return results as a DataFrame for display."""
    try:
        hits = search_database(
            query=query.strip(),
            manufacturer=manufacturer.strip(),
            polymer_family=polymer_family.strip(),
        )
        if not hits.empty:
            return hits
        # Empty result set: show a friendly placeholder row instead of a blank grid.
        return pd.DataFrame({"Info": ["No matching records found."]})
    except Exception as exc:
        logger.exception("DB search error")
        return pd.DataFrame({"Error": [str(exc)]})
171
+
172
+
173
def handle_db_summary() -> tuple[pd.DataFrame, str]:
    """Return the full database summary plus a one-line record count message."""
    try:
        summary = get_database_summary()
        info = f"📊 Database contains {db.count()} datasheet(s)."
        if summary.empty:
            # Keep the count message even when there is nothing to display.
            return pd.DataFrame({"Info": ["Database is empty."]}), info
        return summary, info
    except Exception as exc:
        logger.exception("DB summary error")
        return pd.DataFrame({"Error": [str(exc)]}), f"❌ Error: {exc}"
185
+
186
+
187
def handle_export_csv() -> str | None:
    """Export the entire database to a CSV file for download.

    Returns the path of a temporary CSV file, or None when the database is
    empty or the export fails (the UI treats None as "nothing to download").
    """
    try:
        df = db.get_all_dataframe()
        if df.empty:
            return None
        # Write through the open handle inside a context manager: the original
        # re-opened the same path by name while the handle was still open
        # (fragile on Windows) and leaked the handle if to_csv raised.
        # newline="" is the documented requirement for CSV text streams.
        with tempfile.NamedTemporaryFile(
            suffix=".csv", delete=False, mode="w", encoding="utf-8", newline="",
        ) as tmp:
            df.to_csv(tmp, index=False)
        # delete=False keeps the file on disk so Gradio can serve it.
        return tmp.name
    except Exception:
        logger.exception("Export error")
        return None
202
+
203
+
204
+ # ══════════════════════════════════════════════════════════════════════════════
205
+ # Gradio App
206
+ # ══════════════════════════════════════════════════════════════════════════════
207
+
208
def create_app() -> gr.Blocks:
    """Build the Gradio Blocks application.

    Lays out four tabs (Search & Add, Upload, Database Browser, About) and
    wires each button to its handler. Returns the unlaunched Blocks app.
    """

    with gr.Blocks(
        title="🧪 Polymer Datasheet Agent",
        theme=gr.themes.Soft(),
        css="""
        .header { text-align: center; margin-bottom: 1em; }
        .status-box { font-size: 1.1em; font-weight: 600; padding: 0.5em; }
        """,
    ) as app:

        # ── Header ───────────────────────────────────────────────────────
        gr.Markdown(
            """
            # 🧪 Polymer Datasheet Crawler Agent
            **Build a searchable database of commercial polymer datasheets.**

            This agent uses **Tavily** to search the web for technical datasheets,
            **LLaMA 3.1** to extract structured properties, and stores results in
            a local **SQLite** database.

            ---
            """,
            elem_classes=["header"],
        )

        # ── Tab 1: Search & Add ──────────────────────────────────────────
        with gr.Tab("🔍 Search & Add Datasheet"):
            gr.Markdown(
                "Enter a manufacturer and/or polymer family to search for "
                "datasheets online and add them to the database."
            )

            with gr.Row():
                manufacturer_input = gr.Textbox(
                    label="Manufacturer",
                    placeholder="e.g., SABIC, BASF, DuPont",
                    scale=2,
                )
                polymer_input = gr.Textbox(
                    label="Polymer Family",
                    placeholder="e.g., Polycarbonate, Nylon 6,6, PEEK",
                    scale=2,
                )
                grade_input = gr.Textbox(
                    label="Grade (optional)",
                    placeholder="e.g., Lexan 141R, Ultramid A3K",
                    scale=2,
                )

            search_btn = gr.Button("🔍 Search & Add", variant="primary", size="lg")

            # Status line styled by the .status-box CSS rule above.
            search_status = gr.Textbox(
                label="Status",
                interactive=False,
                elem_classes=["status-box"],
            )

            with gr.Accordion("Extracted Properties", open=True):
                search_table = gr.Dataframe(
                    label="Parsed Datasheet",
                    interactive=False,
                    wrap=True,
                )

            # Raw JSON collapsed by default — a power-user view of the record.
            with gr.Accordion("Raw JSON Output", open=False):
                search_json = gr.Code(
                    label="JSON",
                    language="json",
                    interactive=False,
                )

            search_btn.click(
                fn=handle_search,
                inputs=[manufacturer_input, polymer_input, grade_input],
                outputs=[search_status, search_table, search_json],
            )

        # ── Tab 2: Upload Datasheet ──────────────────────────────────────
        with gr.Tab("📄 Upload Datasheet"):
            gr.Markdown(
                "Upload a PDF datasheet to extract properties and add to the database."
            )

            # type="filepath" makes Gradio pass a path string to the handler.
            file_input = gr.File(
                label="Upload PDF Datasheet",
                file_types=[".pdf"],
                type="filepath",
            )
            upload_btn = gr.Button("📄 Parse & Add", variant="primary", size="lg")

            upload_status = gr.Textbox(
                label="Status",
                interactive=False,
                elem_classes=["status-box"],
            )

            with gr.Accordion("Extracted Properties", open=True):
                upload_table = gr.Dataframe(
                    label="Parsed Datasheet",
                    interactive=False,
                    wrap=True,
                )

            with gr.Accordion("Raw JSON Output", open=False):
                upload_json = gr.Code(
                    label="JSON",
                    language="json",
                    interactive=False,
                )

            upload_btn.click(
                fn=handle_upload,
                inputs=[file_input],
                outputs=[upload_status, upload_table, upload_json],
            )

        # ── Tab 3: Database Browser ──────────────────────────────────────
        with gr.Tab("🗄️ Database Browser"):
            gr.Markdown("Search and browse the existing datasheet database.")

            with gr.Row():
                db_query = gr.Textbox(
                    label="Search query",
                    placeholder="Free text search across all fields...",
                    scale=3,
                )
                db_manufacturer = gr.Textbox(
                    label="Filter: Manufacturer",
                    placeholder="e.g., BASF",
                    scale=2,
                )
                db_polymer = gr.Textbox(
                    label="Filter: Polymer Family",
                    placeholder="e.g., Polyamide",
                    scale=2,
                )

            with gr.Row():
                db_search_btn = gr.Button("🔍 Search Database", variant="primary")
                db_refresh_btn = gr.Button("🔄 Show All Records")
                db_export_btn = gr.Button("📥 Export to CSV")

            db_info = gr.Textbox(label="Info", interactive=False)

            db_results = gr.Dataframe(
                label="Database Records",
                interactive=False,
                wrap=True,
            )

            # Download target; handle_export_csv returns a temp-file path (or None).
            export_file = gr.File(label="Download CSV", visible=True)

            db_search_btn.click(
                fn=handle_db_search,
                inputs=[db_query, db_manufacturer, db_polymer],
                outputs=[db_results],
            )

            db_refresh_btn.click(
                fn=handle_db_summary,
                inputs=[],
                outputs=[db_results, db_info],
            )

            db_export_btn.click(
                fn=handle_export_csv,
                inputs=[],
                outputs=[export_file],
            )

        # ── Tab 4: About / Help ──────────────────────────────────────────
        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                """
                ## Architecture

                This application is built with:

                - **[LangGraph](https://github.com/langchain-ai/langgraph)** —
                  Orchestrates the agent workflow as a directed state graph.
                - **[Tavily](https://tavily.com)** —
                  AI-optimized web search API for finding datasheets.
                - **[LLaMA 3.1](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)** —
                  Open-source LLM via HuggingFace Inference API for structured extraction.
                - **SQLite + SQLAlchemy** — Local relational database.
                - **[Gradio](https://gradio.app)** — Web UI, deployable on HuggingFace Spaces.

                ## Workflow

                ```
                User Input ──► Router ──► Web Search (Tavily) ──► LLM Parse (LLaMA 3.1) ──► Store DB ──► Output
                                 │                                        ▲
                                 └──► Process Upload (PDF) ───────────────┘
                ```

                ## Property Categories

                The agent extracts properties across these categories:
                - **General**: Material name, trade name, manufacturer, grade, applications
                - **Mechanical**: Tensile/flexural strength, modulus, impact, hardness
                - **Thermal**: Tm, Tg, HDT, Vicat, CTE, thermal conductivity
                - **Physical**: Density, MFI, water absorption, specific gravity
                - **Electrical**: Dielectric strength/constant, resistivity
                - **Chemical Resistance**: Acid, alkali, solvent, UV resistance
                - **Regulatory**: FDA, RoHS, REACH, UL94

                ## Data Sources

                The crawler prioritizes trusted sources including:
                MatWeb, Omnexus, UL Prospector, Campus Plastics,
                and official manufacturer portals (SABIC, BASF, DuPont, Dow, etc.)

                ---
                *Built for Plinity — Infinite Recyclable Polymers Project*
                """
            )

    return app
428
+
429
+
430
+ # ══════════════════════════════════════════════════════════════════════════════
431
+ # Main
432
+ # ══════════════════════════════════════════════════════════════════════════════
433
+
434
if __name__ == "__main__":
    app = create_app()
    # Bind to all interfaces on port 7860 — the standard configuration for
    # HuggingFace Spaces; share=False since Spaces provides the public URL.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )