SEUyishu committed on
Commit
8f46cf0
·
verified ·
1 Parent(s): 84a8f07

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +626 -627
app.py CHANGED
@@ -1,627 +1,626 @@
1
- #!/usr/bin/env python3
2
- """
3
- MaTableGPT Gradio Web Interface
4
- ================================
5
-
6
- A web interface for the MaTableGPT MCP service.
7
- Provides an interactive UI for table data extraction from materials science literature.
8
-
9
- For HuggingFace Spaces deployment.
10
- """
11
-
12
- import os
13
- import json
14
- import logging
15
- import gradio as gr
16
- from typing import Optional, Tuple, Dict, Any
17
-
18
- # Configure logging
19
- logging.basicConfig(level=logging.INFO)
20
- logger = logging.getLogger("matablgpt-app")
21
-
22
- # Import MCP service components
23
- try:
24
- from mcp_service import (
25
- table_representer,
26
- table_to_json,
27
- table_splitter,
28
- session_manager,
29
- get_extractor,
30
- GPTExtractor
31
- )
32
- MCP_AVAILABLE = True
33
- except ImportError as e:
34
- logger.warning(f"MCP service not available: {e}")
35
- MCP_AVAILABLE = False
36
-
37
- # =============================================================================
38
- # Helper Functions
39
- # =============================================================================
40
-
41
- def format_json_output(data: Any) -> str:
42
- """Format data as pretty JSON string."""
43
- try:
44
- return json.dumps(data, indent=2, ensure_ascii=False)
45
- except:
46
- return str(data)
47
-
48
-
49
- def check_openai_config() -> Tuple[bool, str]:
50
- """Check if API configuration is complete (supports third-party services)."""
51
- # Check multiple env var names
52
- key = (
53
- os.environ.get('LLM_API_KEY', '') or
54
- os.environ.get('OPENAI_API_KEY', '')
55
- )
56
- base_url = (
57
- os.environ.get('LLM_API_BASE', '') or
58
- os.environ.get('OPENAI_API_BASE', '') or
59
- os.environ.get('OPENAI_BASE_URL', '')
60
- )
61
- model = (
62
- os.environ.get('LLM_MODEL', '') or
63
- os.environ.get('OPENAI_MODEL', '') or
64
- 'gpt-4-turbo-preview'
65
- )
66
-
67
- status_parts = []
68
-
69
- if key:
70
- status_parts.append(f"✅ API Key: ***{key[-4:]}")
71
- else:
72
- return False, "⚠️ API Key not configured (set LLM_API_KEY or OPENAI_API_KEY). GPT extraction will not work."
73
-
74
- if base_url:
75
- # Show shortened URL
76
- display_url = base_url if len(base_url) <= 35 else base_url[:32] + "..."
77
- status_parts.append(f"✅ API URL: {display_url}")
78
- else:
79
- return False, "⚠️ API Base URL not configured (set LLM_API_BASE or OPENAI_API_BASE). Required for third-party API services."
80
-
81
- status_parts.append(f"✅ Model: {model}")
82
-
83
- return True, " | ".join(status_parts)
84
-
85
-
86
- def check_openai_key() -> Tuple[bool, str]:
87
- """Legacy function - redirects to check_openai_config."""
88
- return check_openai_config()
89
-
90
-
91
- # =============================================================================
92
- # Gradio Interface Functions
93
- # =============================================================================
94
-
95
- def convert_html_to_tsv(html_input: str, title: str, caption: str) -> str:
96
- """Convert HTML table to TSV representation."""
97
- if not MCP_AVAILABLE:
98
- return "Error: MCP service not available"
99
-
100
- if not html_input.strip():
101
- return "Error: Please provide HTML table input"
102
-
103
- try:
104
- result = table_representer.html_to_tsv(html_input, title, caption)
105
- return result
106
- except Exception as e:
107
- return f"Error: {str(e)}"
108
-
109
-
110
- def convert_html_to_json(html_input: str, title: str, caption: str) -> str:
111
- """Convert HTML table to JSON representation."""
112
- if not MCP_AVAILABLE:
113
- return "Error: MCP service not available"
114
-
115
- if not html_input.strip():
116
- return "Error: Please provide HTML table input"
117
-
118
- try:
119
- result = table_to_json.html_to_json(html_input, title, caption)
120
- return format_json_output(result)
121
- except Exception as e:
122
- return f"Error: {str(e)}"
123
-
124
-
125
- def analyze_table(html_input: str) -> str:
126
- """Analyze HTML table structure."""
127
- if not MCP_AVAILABLE:
128
- return "Error: MCP service not available"
129
-
130
- if not html_input.strip():
131
- return "Error: Please provide HTML table input"
132
-
133
- try:
134
- result = table_splitter.analyze_table_structure(html_input)
135
- return format_json_output(result)
136
- except Exception as e:
137
- return f"Error: {str(e)}"
138
-
139
-
140
- def split_table(html_input: str, title: str, caption: str) -> str:
141
- """Split complex table into simpler components."""
142
- if not MCP_AVAILABLE:
143
- return "Error: MCP service not available"
144
-
145
- if not html_input.strip():
146
- return "Error: Please provide HTML table input"
147
-
148
- try:
149
- result = table_splitter.split_table(html_input, title, caption)
150
- return format_json_output({
151
- "table_count": len(result),
152
- "tables": result
153
- })
154
- except Exception as e:
155
- return f"Error: {str(e)}"
156
-
157
-
158
- def extract_zero_shot(table_repr: str) -> str:
159
- """Extract catalyst data using zero-shot approach."""
160
- if not MCP_AVAILABLE:
161
- return "Error: MCP service not available"
162
-
163
- if not table_repr.strip():
164
- return "Error: Please provide table representation"
165
-
166
- has_key, key_status = check_openai_key()
167
- if not has_key:
168
- return f"Error: {key_status}"
169
-
170
- try:
171
- extractor = get_extractor()
172
- result = extractor.extract_zero_shot(table_repr)
173
- return format_json_output(result)
174
- except Exception as e:
175
- return f"Error: {str(e)}"
176
-
177
-
178
- def extract_few_shot(table_repr: str, examples_json: str) -> str:
179
- """Extract catalyst data using few-shot approach."""
180
- if not MCP_AVAILABLE:
181
- return "Error: MCP service not available"
182
-
183
- if not table_repr.strip():
184
- return "Error: Please provide table representation"
185
-
186
- has_key, key_status = check_openai_key()
187
- if not has_key:
188
- return f"Error: {key_status}"
189
-
190
- try:
191
- examples = json.loads(examples_json) if examples_json.strip() else []
192
- extractor = get_extractor()
193
- result = extractor.extract_few_shot(table_repr, examples)
194
- return format_json_output(result)
195
- except json.JSONDecodeError:
196
- return "Error: Invalid examples JSON format"
197
- except Exception as e:
198
- return f"Error: {str(e)}"
199
-
200
-
201
- def validate_extraction(extraction_json: str) -> str:
202
- """Validate extraction result."""
203
- if not extraction_json.strip():
204
- return "Error: Please provide extraction JSON"
205
-
206
- try:
207
- extraction = json.loads(extraction_json)
208
- except json.JSONDecodeError:
209
- return "Error: Invalid JSON format"
210
-
211
- issues = []
212
- warnings = []
213
-
214
- if not isinstance(extraction, dict):
215
- return format_json_output({"valid": False, "issues": ["Extraction must be a dictionary"]})
216
-
217
- if "error" in extraction:
218
- issues.append(f"Extraction contains error: {extraction['error']}")
219
-
220
- valid_performance_types = set(GPTExtractor.PERFORMANCE_LIST)
221
-
222
- for catalyst_name, performances in extraction.items():
223
- if catalyst_name in ["error", "raw_response", "catalysts"]:
224
- continue
225
-
226
- if not isinstance(performances, dict):
227
- warnings.append(f"Catalyst '{catalyst_name}' should have dict of performances")
228
- continue
229
-
230
- for perf_name, properties in performances.items():
231
- if perf_name not in valid_performance_types:
232
- warnings.append(f"Unknown performance type: {perf_name}")
233
-
234
- if isinstance(properties, dict):
235
- for prop_key in properties.keys():
236
- if prop_key not in GPTExtractor.PROPERTY_TEMPLATE:
237
- warnings.append(f"Unknown property key: {prop_key}")
238
-
239
- return format_json_output({
240
- "valid": len(issues) == 0,
241
- "issues": issues,
242
- "warnings": warnings
243
- })
244
-
245
-
246
- def get_performance_types() -> str:
247
- """Get list of supported performance types."""
248
- return format_json_output({
249
- "performance_types": GPTExtractor.PERFORMANCE_LIST,
250
- "property_template": GPTExtractor.PROPERTY_TEMPLATE
251
- })
252
-
253
-
254
- def get_code_template(repr_format: str, model_type: str) -> str:
255
- """Generate code template for local extraction."""
256
- code = f'''"""
257
- MaTableGPT Local Extraction Template
258
- Model Type: {model_type}
259
- Representation Format: {repr_format}
260
- """
261
-
262
- from openai import OpenAI
263
- import json
264
-
265
- # Initialize client
266
- client = OpenAI(api_key="YOUR_API_KEY")
267
-
268
- # Performance types to extract
269
- PERFORMANCE_LIST = [
270
- 'overpotential', 'tafel_slope', 'Rct', 'stability', 'Cdl',
271
- 'onset_potential', 'current_density', 'potential', 'TOF', 'ECSA',
272
- 'water_splitting_potential', 'mass_activity', 'exchange_current_density',
273
- 'Rs', 'specific_activity', 'onset_overpotential', 'BET', 'surface_area',
274
- 'loading', 'apparent_activation_energy'
275
- ]
276
-
277
- # Your table representation
278
- table_representation = """
279
- # Paste your {repr_format.upper()} representation here
280
- """
281
-
282
- # System prompt
283
- system_prompt = """I will extract catalyst performance information from the table and create JSON format.
284
- Performance types: """ + str(PERFORMANCE_LIST) + """
285
- The JSON format will have performance within the catalyst, with elements:
286
- reaction type, value, electrolyte, condition, current density, versus, substrate.
287
- Output must contain only JSON dictionary."""
288
-
289
- # Extract
290
- response = client.chat.completions.create(
291
- model="gpt-4-turbo-preview",
292
- messages=[
293
- {{"role": "system", "content": system_prompt}},
294
- {{"role": "user", "content": table_representation}}
295
- ],
296
- temperature=0
297
- )
298
-
299
- result = response.choices[0].message.content.strip()
300
- print(json.dumps(json.loads(result), indent=2))
301
- '''
302
- return code
303
-
304
-
305
- # =============================================================================
306
- # Gradio UI
307
- # =============================================================================
308
-
309
- # Sample HTML table for demo
310
- SAMPLE_HTML = '''<table>
311
- <thead>
312
- <tr>
313
- <th>Catalyst</th>
314
- <th>Overpotential (mV)</th>
315
- <th>Tafel Slope (mV/dec)</th>
316
- <th>Electrolyte</th>
317
- </tr>
318
- </thead>
319
- <tbody>
320
- <tr>
321
- <td>Pt/C</td>
322
- <td>280</td>
323
- <td>65</td>
324
- <td>1M KOH</td>
325
- </tr>
326
- <tr>
327
- <td>NiFe-LDH</td>
328
- <td>230</td>
329
- <td>45</td>
330
- <td>1M KOH</td>
331
- </tr>
332
- <tr>
333
- <td>Co3O4</td>
334
- <td>350</td>
335
- <td>78</td>
336
- <td>1M KOH</td>
337
- </tr>
338
- </tbody>
339
- </table>'''
340
-
341
-
342
- def create_ui():
343
- """Create Gradio interface."""
344
-
345
- # Check status
346
- has_key, key_status = check_openai_key()
347
- status_color = "green" if has_key else "orange"
348
-
349
- with gr.Blocks(
350
- title="MaTableGPT - Table Data Extractor",
351
- theme=gr.themes.Soft()
352
- ) as app:
353
-
354
- gr.Markdown("""
355
- # 🔬 MaTableGPT - Table Data Extractor
356
-
357
- **Extract structured catalyst performance data from HTML tables in materials science literature**
358
-
359
- This tool uses GPT models to convert complex HTML tables into structured JSON data with
360
- catalyst names, performance metrics (overpotential, Tafel slope, etc.), and associated properties.
361
- """)
362
-
363
- gr.Markdown(f"**Status:** <span style='color:{status_color}'>{key_status}</span>")
364
-
365
- with gr.Tabs():
366
- # Tab 1: Table Representation
367
- with gr.TabItem("📋 Table Representation"):
368
- gr.Markdown("### Convert HTML tables to TSV or JSON format")
369
-
370
- with gr.Row():
371
- with gr.Column():
372
- html_input = gr.Textbox(
373
- label="HTML Table Input",
374
- placeholder="Paste your HTML table here...",
375
- lines=15,
376
- value=SAMPLE_HTML
377
- )
378
- title_input = gr.Textbox(
379
- label="Table Title (optional)",
380
- placeholder="e.g., Table 1: OER Catalyst Performance"
381
- )
382
- caption_input = gr.Textbox(
383
- label="Table Caption (optional)",
384
- placeholder="e.g., Performance measured at 10 mA/cm²"
385
- )
386
-
387
- with gr.Row():
388
- tsv_btn = gr.Button("Convert to TSV", variant="primary")
389
- json_btn = gr.Button("Convert to JSON", variant="primary")
390
-
391
- with gr.Column():
392
- repr_output = gr.Textbox(
393
- label="Representation Output",
394
- lines=20,
395
- show_copy_button=True
396
- )
397
-
398
- tsv_btn.click(
399
- convert_html_to_tsv,
400
- inputs=[html_input, title_input, caption_input],
401
- outputs=repr_output
402
- )
403
- json_btn.click(
404
- convert_html_to_json,
405
- inputs=[html_input, title_input, caption_input],
406
- outputs=repr_output
407
- )
408
-
409
- # Tab 2: Table Analysis & Splitting
410
- with gr.TabItem("🔍 Table Analysis"):
411
- gr.Markdown("### Analyze and split complex tables")
412
-
413
- with gr.Row():
414
- with gr.Column():
415
- html_analyze = gr.Textbox(
416
- label="HTML Table Input",
417
- placeholder="Paste your HTML table here...",
418
- lines=10,
419
- value=SAMPLE_HTML
420
- )
421
-
422
- with gr.Row():
423
- analyze_btn = gr.Button("Analyze Structure", variant="secondary")
424
- split_btn = gr.Button("Split Table", variant="secondary")
425
-
426
- with gr.Column():
427
- analysis_output = gr.Textbox(
428
- label="Analysis Result",
429
- lines=15,
430
- show_copy_button=True
431
- )
432
-
433
- analyze_btn.click(
434
- analyze_table,
435
- inputs=html_analyze,
436
- outputs=analysis_output
437
- )
438
- split_btn.click(
439
- split_table,
440
- inputs=[html_analyze, title_input, caption_input],
441
- outputs=analysis_output
442
- )
443
-
444
- # Tab 3: GPT Extraction
445
- with gr.TabItem("🤖 GPT Extraction"):
446
- gr.Markdown("### Extract catalyst data using GPT models")
447
-
448
- if not has_key:
449
- gr.Markdown("""
450
- ⚠️ **OpenAI API Key Required**
451
-
452
- Set the `OPENAI_API_KEY` environment variable to enable GPT extraction.
453
- """)
454
-
455
- with gr.Row():
456
- with gr.Column():
457
- table_repr_input = gr.Textbox(
458
- label="Table Representation (TSV or JSON)",
459
- placeholder="Paste your table representation here...",
460
- lines=10
461
- )
462
-
463
- extraction_method = gr.Radio(
464
- ["Zero-shot", "Few-shot"],
465
- label="Extraction Method",
466
- value="Zero-shot"
467
- )
468
-
469
- examples_input = gr.Textbox(
470
- label="Examples (for Few-shot, JSON format)",
471
- placeholder='[{"input": "...", "output": "..."}]',
472
- lines=5,
473
- visible=False
474
- )
475
-
476
- extract_btn = gr.Button("Extract Catalyst Data", variant="primary")
477
-
478
- with gr.Column():
479
- extraction_output = gr.Textbox(
480
- label="Extraction Result",
481
- lines=20,
482
- show_copy_button=True
483
- )
484
-
485
- def update_examples_visibility(method):
486
- return gr.update(visible=(method == "Few-shot"))
487
-
488
- extraction_method.change(
489
- update_examples_visibility,
490
- inputs=extraction_method,
491
- outputs=examples_input
492
- )
493
-
494
- def extract_data(table_repr, method, examples):
495
- if method == "Zero-shot":
496
- return extract_zero_shot(table_repr)
497
- else:
498
- return extract_few_shot(table_repr, examples)
499
-
500
- extract_btn.click(
501
- extract_data,
502
- inputs=[table_repr_input, extraction_method, examples_input],
503
- outputs=extraction_output
504
- )
505
-
506
- # Tab 4: Validation
507
- with gr.TabItem(" Validation"):
508
- gr.Markdown("### Validate extraction results")
509
-
510
- with gr.Row():
511
- with gr.Column():
512
- validation_input = gr.Textbox(
513
- label="Extraction JSON to Validate",
514
- placeholder="Paste extraction JSON here...",
515
- lines=15
516
- )
517
- validate_btn = gr.Button("Validate", variant="secondary")
518
-
519
- with gr.Column():
520
- validation_output = gr.Textbox(
521
- label="Validation Result",
522
- lines=10
523
- )
524
-
525
- gr.Markdown("### Supported Performance Types")
526
- perf_types = gr.Textbox(
527
- label="",
528
- value=get_performance_types(),
529
- lines=10,
530
- interactive=False
531
- )
532
-
533
- validate_btn.click(
534
- validate_extraction,
535
- inputs=validation_input,
536
- outputs=validation_output
537
- )
538
-
539
- # Tab 5: Code Template
540
- with gr.TabItem("💻 Code Template"):
541
- gr.Markdown("### Generate Python code for local extraction")
542
-
543
- with gr.Row():
544
- repr_format = gr.Dropdown(
545
- ["tsv", "json"],
546
- label="Representation Format",
547
- value="tsv"
548
- )
549
- model_type = gr.Dropdown(
550
- ["zero-shot", "few-shot", "fine-tuning"],
551
- label="Model Type",
552
- value="zero-shot"
553
- )
554
-
555
- generate_btn = gr.Button("Generate Code", variant="secondary")
556
-
557
- code_output = gr.Code(
558
- label="Python Code Template",
559
- language="python",
560
- lines=30
561
- )
562
-
563
- generate_btn.click(
564
- get_code_template,
565
- inputs=[repr_format, model_type],
566
- outputs=code_output
567
- )
568
-
569
- # Tab 6: About
570
- with gr.TabItem("ℹ️ About"):
571
- gr.Markdown("""
572
- ## About MaTableGPT
573
-
574
- MaTableGPT is a GPT-based table data extractor specifically designed for
575
- materials science literature. It converts complex HTML tables containing
576
- catalyst performance data into structured JSON format.
577
-
578
- ### Workflow
579
-
580
- 1. **Table Representation**: Convert HTML tables to TSV or JSON format
581
- 2. **Table Splitting** (optional): Break down complex tables with multiple headers
582
- 3. **GPT Extraction**: Use zero-shot, few-shot, or fine-tuned models to extract data
583
- 4. **Validation**: Verify extraction results against expected schema
584
-
585
- ### Supported Performance Types
586
-
587
- - Overpotential, Tafel slope, Rct, Stability, Cdl
588
- - Onset potential, Current density, Potential, TOF, ECSA
589
- - Water splitting potential, Mass activity, Exchange current density
590
- - Rs, Specific activity, Onset overpotential, BET, Surface area
591
- - Loading, Apparent activation energy
592
-
593
- ### MCP Integration
594
-
595
- This service is also available as an MCP (Model Context Protocol) server,
596
- allowing integration with AI assistants like Claude.
597
-
598
- ### Credits
599
-
600
- Based on [MaTableGPT](https://github.com/your-repo/MaTableGPT) research.
601
- """)
602
-
603
- gr.Markdown("---\n*MaTableGPT MCP Service - Materials Science Table Data Extraction*")
604
-
605
- return app
606
-
607
-
608
- # =============================================================================
609
- # Main Entry Point
610
- # =============================================================================
611
-
612
- def main():
613
- """Run the Gradio app."""
614
- app = create_ui()
615
-
616
- # Get port from environment or default
617
- port = int(os.environ.get('GRADIO_SERVER_PORT', 7860))
618
-
619
- app.launch(
620
- server_name="0.0.0.0",
621
- server_port=port,
622
- share=False
623
- )
624
-
625
-
626
- if __name__ == "__main__":
627
- main()
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ MaTableGPT Gradio Web Interface
4
+ ================================
5
+
6
+ A web interface for the MaTableGPT MCP service.
7
+ Provides an interactive UI for table data extraction from materials science literature.
8
+
9
+ For HuggingFace Spaces deployment.
10
+ """
11
+
12
+ import os
13
+ import json
14
+ import logging
15
+ import gradio as gr
16
+ from typing import Optional, Tuple, Dict, Any
17
+
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger("matablgpt-app")
21
+
22
+ # Import MCP service components
23
+ try:
24
+ from mcp_service import (
25
+ table_representer,
26
+ table_to_json,
27
+ table_splitter,
28
+ session_manager,
29
+ get_extractor,
30
+ GPTExtractor
31
+ )
32
+ MCP_AVAILABLE = True
33
+ except ImportError as e:
34
+ logger.warning(f"MCP service not available: {e}")
35
+ MCP_AVAILABLE = False
36
+
37
+ # =============================================================================
38
+ # Helper Functions
39
+ # =============================================================================
40
+
41
+ def format_json_output(data: Any) -> str:
42
+ """Format data as pretty JSON string."""
43
+ try:
44
+ return json.dumps(data, indent=2, ensure_ascii=False)
45
+ except:
46
+ return str(data)
47
+
48
+
49
+ def check_openai_config() -> Tuple[bool, str]:
50
+ """Check if API configuration is complete (supports third-party services)."""
51
+ # Check multiple env var names
52
+ key = (
53
+ os.environ.get('LLM_API_KEY', '') or
54
+ os.environ.get('OPENAI_API_KEY', '')
55
+ )
56
+ base_url = (
57
+ os.environ.get('LLM_API_BASE', '') or
58
+ os.environ.get('OPENAI_API_BASE', '') or
59
+ os.environ.get('OPENAI_BASE_URL', '')
60
+ )
61
+ model = (
62
+ os.environ.get('LLM_MODEL', '') or
63
+ os.environ.get('OPENAI_MODEL', '') or
64
+ 'gpt-4-turbo-preview'
65
+ )
66
+
67
+ status_parts = []
68
+
69
+ if key:
70
+ status_parts.append(f"✅ API Key: ***{key[-4:]}")
71
+ else:
72
+ return False, "⚠️ API Key not configured (set LLM_API_KEY or OPENAI_API_KEY). GPT extraction will not work."
73
+
74
+ if base_url:
75
+ # Show shortened URL
76
+ display_url = base_url if len(base_url) <= 35 else base_url[:32] + "..."
77
+ status_parts.append(f"✅ API URL: {display_url}")
78
+ else:
79
+ return False, "⚠️ API Base URL not configured (set LLM_API_BASE or OPENAI_API_BASE). Required for third-party API services."
80
+
81
+ status_parts.append(f"✅ Model: {model}")
82
+
83
+ return True, " | ".join(status_parts)
84
+
85
+
86
+ def check_openai_key() -> Tuple[bool, str]:
87
+ """Legacy function - redirects to check_openai_config."""
88
+ return check_openai_config()
89
+
90
+
91
+ # =============================================================================
92
+ # Gradio Interface Functions
93
+ # =============================================================================
94
+
95
+ def convert_html_to_tsv(html_input: str, title: str, caption: str) -> str:
96
+ """Convert HTML table to TSV representation."""
97
+ if not MCP_AVAILABLE:
98
+ return "Error: MCP service not available"
99
+
100
+ if not html_input.strip():
101
+ return "Error: Please provide HTML table input"
102
+
103
+ try:
104
+ result = table_representer.html_to_tsv(html_input, title, caption)
105
+ return result
106
+ except Exception as e:
107
+ return f"Error: {str(e)}"
108
+
109
+
110
+ def convert_html_to_json(html_input: str, title: str, caption: str) -> str:
111
+ """Convert HTML table to JSON representation."""
112
+ if not MCP_AVAILABLE:
113
+ return "Error: MCP service not available"
114
+
115
+ if not html_input.strip():
116
+ return "Error: Please provide HTML table input"
117
+
118
+ try:
119
+ result = table_to_json.html_to_json(html_input, title, caption)
120
+ return format_json_output(result)
121
+ except Exception as e:
122
+ return f"Error: {str(e)}"
123
+
124
+
125
+ def analyze_table(html_input: str) -> str:
126
+ """Analyze HTML table structure."""
127
+ if not MCP_AVAILABLE:
128
+ return "Error: MCP service not available"
129
+
130
+ if not html_input.strip():
131
+ return "Error: Please provide HTML table input"
132
+
133
+ try:
134
+ result = table_splitter.analyze_table_structure(html_input)
135
+ return format_json_output(result)
136
+ except Exception as e:
137
+ return f"Error: {str(e)}"
138
+
139
+
140
+ def split_table(html_input: str, title: str, caption: str) -> str:
141
+ """Split complex table into simpler components."""
142
+ if not MCP_AVAILABLE:
143
+ return "Error: MCP service not available"
144
+
145
+ if not html_input.strip():
146
+ return "Error: Please provide HTML table input"
147
+
148
+ try:
149
+ result = table_splitter.split_table(html_input, title, caption)
150
+ return format_json_output({
151
+ "table_count": len(result),
152
+ "tables": result
153
+ })
154
+ except Exception as e:
155
+ return f"Error: {str(e)}"
156
+
157
+
158
+ def extract_zero_shot(table_repr: str) -> str:
159
+ """Extract catalyst data using zero-shot approach."""
160
+ if not MCP_AVAILABLE:
161
+ return "Error: MCP service not available"
162
+
163
+ if not table_repr.strip():
164
+ return "Error: Please provide table representation"
165
+
166
+ has_key, key_status = check_openai_key()
167
+ if not has_key:
168
+ return f"Error: {key_status}"
169
+
170
+ try:
171
+ extractor = get_extractor()
172
+ result = extractor.extract_zero_shot(table_repr)
173
+ return format_json_output(result)
174
+ except Exception as e:
175
+ return f"Error: {str(e)}"
176
+
177
+
178
+ def extract_few_shot(table_repr: str, examples_json: str) -> str:
179
+ """Extract catalyst data using few-shot approach."""
180
+ if not MCP_AVAILABLE:
181
+ return "Error: MCP service not available"
182
+
183
+ if not table_repr.strip():
184
+ return "Error: Please provide table representation"
185
+
186
+ has_key, key_status = check_openai_key()
187
+ if not has_key:
188
+ return f"Error: {key_status}"
189
+
190
+ try:
191
+ examples = json.loads(examples_json) if examples_json.strip() else []
192
+ extractor = get_extractor()
193
+ result = extractor.extract_few_shot(table_repr, examples)
194
+ return format_json_output(result)
195
+ except json.JSONDecodeError:
196
+ return "Error: Invalid examples JSON format"
197
+ except Exception as e:
198
+ return f"Error: {str(e)}"
199
+
200
+
201
+ def validate_extraction(extraction_json: str) -> str:
202
+ """Validate extraction result."""
203
+ if not extraction_json.strip():
204
+ return "Error: Please provide extraction JSON"
205
+
206
+ try:
207
+ extraction = json.loads(extraction_json)
208
+ except json.JSONDecodeError:
209
+ return "Error: Invalid JSON format"
210
+
211
+ issues = []
212
+ warnings = []
213
+
214
+ if not isinstance(extraction, dict):
215
+ return format_json_output({"valid": False, "issues": ["Extraction must be a dictionary"]})
216
+
217
+ if "error" in extraction:
218
+ issues.append(f"Extraction contains error: {extraction['error']}")
219
+
220
+ valid_performance_types = set(GPTExtractor.PERFORMANCE_LIST)
221
+
222
+ for catalyst_name, performances in extraction.items():
223
+ if catalyst_name in ["error", "raw_response", "catalysts"]:
224
+ continue
225
+
226
+ if not isinstance(performances, dict):
227
+ warnings.append(f"Catalyst '{catalyst_name}' should have dict of performances")
228
+ continue
229
+
230
+ for perf_name, properties in performances.items():
231
+ if perf_name not in valid_performance_types:
232
+ warnings.append(f"Unknown performance type: {perf_name}")
233
+
234
+ if isinstance(properties, dict):
235
+ for prop_key in properties.keys():
236
+ if prop_key not in GPTExtractor.PROPERTY_TEMPLATE:
237
+ warnings.append(f"Unknown property key: {prop_key}")
238
+
239
+ return format_json_output({
240
+ "valid": len(issues) == 0,
241
+ "issues": issues,
242
+ "warnings": warnings
243
+ })
244
+
245
+
246
+ def get_performance_types() -> str:
247
+ """Get list of supported performance types."""
248
+ return format_json_output({
249
+ "performance_types": GPTExtractor.PERFORMANCE_LIST,
250
+ "property_template": GPTExtractor.PROPERTY_TEMPLATE
251
+ })
252
+
253
+
254
+ def get_code_template(repr_format: str, model_type: str) -> str:
255
+ """Generate code template for local extraction."""
256
+ code = f'''"""
257
+ MaTableGPT Local Extraction Template
258
+ Model Type: {model_type}
259
+ Representation Format: {repr_format}
260
+ """
261
+
262
+ from openai import OpenAI
263
+ import json
264
+
265
+ # Initialize client
266
+ client = OpenAI(api_key="YOUR_API_KEY")
267
+
268
+ # Performance types to extract
269
+ PERFORMANCE_LIST = [
270
+ 'overpotential', 'tafel_slope', 'Rct', 'stability', 'Cdl',
271
+ 'onset_potential', 'current_density', 'potential', 'TOF', 'ECSA',
272
+ 'water_splitting_potential', 'mass_activity', 'exchange_current_density',
273
+ 'Rs', 'specific_activity', 'onset_overpotential', 'BET', 'surface_area',
274
+ 'loading', 'apparent_activation_energy'
275
+ ]
276
+
277
+ # Your table representation
278
+ table_representation = """
279
+ # Paste your {repr_format.upper()} representation here
280
+ """
281
+
282
+ # System prompt
283
+ system_prompt = """I will extract catalyst performance information from the table and create JSON format.
284
+ Performance types: """ + str(PERFORMANCE_LIST) + """
285
+ The JSON format will have performance within the catalyst, with elements:
286
+ reaction type, value, electrolyte, condition, current density, versus, substrate.
287
+ Output must contain only JSON dictionary."""
288
+
289
+ # Extract
290
+ response = client.chat.completions.create(
291
+ model="gpt-4-turbo-preview",
292
+ messages=[
293
+ {{"role": "system", "content": system_prompt}},
294
+ {{"role": "user", "content": table_representation}}
295
+ ],
296
+ temperature=0
297
+ )
298
+
299
+ result = response.choices[0].message.content.strip()
300
+ print(json.dumps(json.loads(result), indent=2))
301
+ '''
302
+ return code
303
+
304
+
305
+ # =============================================================================
306
+ # Gradio UI
307
+ # =============================================================================
308
+
309
+ # Sample HTML table for demo
310
+ SAMPLE_HTML = '''<table>
311
+ <thead>
312
+ <tr>
313
+ <th>Catalyst</th>
314
+ <th>Overpotential (mV)</th>
315
+ <th>Tafel Slope (mV/dec)</th>
316
+ <th>Electrolyte</th>
317
+ </tr>
318
+ </thead>
319
+ <tbody>
320
+ <tr>
321
+ <td>Pt/C</td>
322
+ <td>280</td>
323
+ <td>65</td>
324
+ <td>1M KOH</td>
325
+ </tr>
326
+ <tr>
327
+ <td>NiFe-LDH</td>
328
+ <td>230</td>
329
+ <td>45</td>
330
+ <td>1M KOH</td>
331
+ </tr>
332
+ <tr>
333
+ <td>Co3O4</td>
334
+ <td>350</td>
335
+ <td>78</td>
336
+ <td>1M KOH</td>
337
+ </tr>
338
+ </tbody>
339
+ </table>'''
340
+
341
+
342
def _build_representation_tab():
    """Populate the "Table Representation" tab.

    Must be called while a ``gr.Tabs()`` context is active. The two convert
    buttons are wired to ``convert_html_to_tsv`` and ``convert_html_to_json``
    and both write into the same output box.

    Returns:
        The ``(title, caption)`` textboxes, which the analysis tab reuses as
        inputs for its "Split Table" action.
    """
    with gr.TabItem("📋 Table Representation"):
        gr.Markdown("### Convert HTML tables to TSV or JSON format")

        with gr.Row():
            with gr.Column():
                html_box = gr.Textbox(
                    value=SAMPLE_HTML,
                    lines=15,
                    placeholder="Paste your HTML table here...",
                    label="HTML Table Input",
                )
                title_box = gr.Textbox(
                    placeholder="e.g., Table 1: OER Catalyst Performance",
                    label="Table Title (optional)",
                )
                caption_box = gr.Textbox(
                    placeholder="e.g., Performance measured at 10 mA/cm²",
                    label="Table Caption (optional)",
                )

                with gr.Row():
                    to_tsv = gr.Button("Convert to TSV", variant="primary")
                    to_json = gr.Button("Convert to JSON", variant="primary")

            with gr.Column():
                result_box = gr.Textbox(
                    show_copy_button=True,
                    lines=20,
                    label="Representation Output",
                )

        # Both converters take the same three inputs and share one output.
        shared_inputs = [html_box, title_box, caption_box]
        to_tsv.click(convert_html_to_tsv, inputs=shared_inputs, outputs=result_box)
        to_json.click(convert_html_to_json, inputs=shared_inputs, outputs=result_box)

    return title_box, caption_box


def _build_analysis_tab(title_input, caption_input):
    """Populate the "Table Analysis" tab.

    Args:
        title_input: Title textbox created by the representation tab; passed
            to ``split_table`` alongside this tab's own HTML input.
        caption_input: Caption textbox created by the representation tab;
            passed to ``split_table`` as well.
    """
    with gr.TabItem("🔍 Table Analysis"):
        gr.Markdown("### Analyze and split complex tables")

        with gr.Row():
            with gr.Column():
                html_box = gr.Textbox(
                    value=SAMPLE_HTML,
                    lines=10,
                    placeholder="Paste your HTML table here...",
                    label="HTML Table Input",
                )

                with gr.Row():
                    analyze = gr.Button("Analyze Structure", variant="secondary")
                    split = gr.Button("Split Table", variant="secondary")

            with gr.Column():
                result_box = gr.Textbox(
                    show_copy_button=True,
                    lines=15,
                    label="Analysis Result",
                )

        analyze.click(analyze_table, inputs=html_box, outputs=result_box)
        # Splitting reads title/caption from the representation tab's fields.
        split.click(
            split_table,
            inputs=[html_box, title_input, caption_input],
            outputs=result_box,
        )


def _build_extraction_tab(has_key):
    """Populate the "GPT Extraction" tab.

    Args:
        has_key: Truthy when an API key was detected; when falsy, a warning
            banner explaining how to set ``OPENAI_API_KEY`` is rendered.
    """
    with gr.TabItem("🤖 GPT Extraction"):
        gr.Markdown("### Extract catalyst data using GPT models")

        if not has_key:
            gr.Markdown("""
            ⚠️ **OpenAI API Key Required**

            Set the `OPENAI_API_KEY` environment variable to enable GPT extraction.
            """)

        with gr.Row():
            with gr.Column():
                repr_box = gr.Textbox(
                    lines=10,
                    placeholder="Paste your table representation here...",
                    label="Table Representation (TSV or JSON)",
                )

                method_radio = gr.Radio(
                    ["Zero-shot", "Few-shot"],
                    value="Zero-shot",
                    label="Extraction Method",
                )

                # Hidden until the user selects the "Few-shot" method.
                examples_box = gr.Textbox(
                    visible=False,
                    lines=5,
                    placeholder='[{"input": "...", "output": "..."}]',
                    label="Examples (for Few-shot, JSON format)",
                )

                run_btn = gr.Button("Extract Catalyst Data", variant="primary")

            with gr.Column():
                result_box = gr.Textbox(
                    show_copy_button=True,
                    lines=20,
                    label="Extraction Result",
                )

        # Show the examples box only while "Few-shot" is selected.
        def update_examples_visibility(method):
            show_examples = method == "Few-shot"
            return gr.update(visible=show_examples)

        method_radio.change(
            update_examples_visibility,
            inputs=method_radio,
            outputs=examples_box,
        )

        # Dispatch to the zero-shot or few-shot extractor by selected method.
        def extract_data(table_repr, method, examples):
            if method != "Zero-shot":
                return extract_few_shot(table_repr, examples)
            return extract_zero_shot(table_repr)

        run_btn.click(
            extract_data,
            inputs=[repr_box, method_radio, examples_box],
            outputs=result_box,
        )


def _build_validation_tab():
    """Populate the "Validation" tab, wired to ``validate_extraction``."""
    with gr.TabItem("✅ Validation"):
        gr.Markdown("### Validate extraction results")

        with gr.Row():
            with gr.Column():
                json_box = gr.Textbox(
                    lines=15,
                    placeholder="Paste extraction JSON here...",
                    label="Extraction JSON to Validate",
                )
                run_btn = gr.Button("Validate", variant="secondary")

            with gr.Column():
                result_box = gr.Textbox(
                    lines=10,
                    label="Validation Result",
                )

                # Read-only reference list, filled once when the UI is built.
                gr.Markdown("### Supported Performance Types")
                gr.Textbox(
                    interactive=False,
                    lines=10,
                    value=get_performance_types(),
                    label="",
                )

        run_btn.click(validate_extraction, inputs=json_box, outputs=result_box)


def _build_code_template_tab():
    """Populate the "Code Template" tab, wired to ``get_code_template``."""
    with gr.TabItem("💻 Code Template"):
        gr.Markdown("### Generate Python code for local extraction")

        with gr.Row():
            format_choice = gr.Dropdown(
                ["tsv", "json"],
                value="tsv",
                label="Representation Format",
            )
            model_choice = gr.Dropdown(
                ["zero-shot", "few-shot", "fine-tuning"],
                value="zero-shot",
                label="Model Type",
            )

        run_btn = gr.Button("Generate Code", variant="secondary")

        code_view = gr.Code(
            lines=30,
            language="python",
            label="Python Code Template",
        )

        run_btn.click(
            get_code_template,
            inputs=[format_choice, model_choice],
            outputs=code_view,
        )


def _build_about_tab():
    """Populate the static "About" tab."""
    with gr.TabItem("ℹ️ About"):
        gr.Markdown("""
        ## About MaTableGPT

        MaTableGPT is a GPT-based table data extractor specifically designed for
        materials science literature. It converts complex HTML tables containing
        catalyst performance data into structured JSON format.

        ### Workflow

        1. **Table Representation**: Convert HTML tables to TSV or JSON format
        2. **Table Splitting** (optional): Break down complex tables with multiple headers
        3. **GPT Extraction**: Use zero-shot, few-shot, or fine-tuned models to extract data
        4. **Validation**: Verify extraction results against expected schema

        ### Supported Performance Types

        - Overpotential, Tafel slope, Rct, Stability, Cdl
        - Onset potential, Current density, Potential, TOF, ECSA
        - Water splitting potential, Mass activity, Exchange current density
        - Rs, Specific activity, Onset overpotential, BET, Surface area
        - Loading, Apparent activation energy

        ### MCP Integration

        This service is also available as an MCP (Model Context Protocol) server,
        allowing integration with AI assistants like Claude.

        ### Credits

        Based on [MaTableGPT](https://github.com/your-repo/MaTableGPT) research.
        """)


def create_ui():
    """Assemble the MaTableGPT Gradio application.

    Renders a header with an API-key status line (green when a key was
    detected, orange otherwise), then six tabs built in order:
    representation, analysis, GPT extraction, validation, code template and
    about, followed by a footer.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is expected to launch it.
    """
    has_key, key_status = check_openai_key()
    if has_key:
        status_color = "green"
    else:
        status_color = "orange"

    with gr.Blocks(title="MaTableGPT - Table Data Extractor") as app:
        gr.Markdown("""
        # 🔬 MaTableGPT - Table Data Extractor

        **Extract structured catalyst performance data from HTML tables in materials science literature**

        This tool uses GPT models to convert complex HTML tables into structured JSON data with
        catalyst names, performance metrics (overpotential, Tafel slope, etc.), and associated properties.
        """)

        gr.Markdown(f"**Status:** <span style='color:{status_color}'>{key_status}</span>")

        with gr.Tabs():
            # Builder order defines the tab order shown to the user.
            title_box, caption_box = _build_representation_tab()
            _build_analysis_tab(title_box, caption_box)
            _build_extraction_tab(has_key)
            _build_validation_tab()
            _build_code_template_tab()
            _build_about_tab()

        gr.Markdown("---\n*MaTableGPT MCP Service - Materials Science Table Data Extraction*")

    return app
605
+
606
+
607
+ # =============================================================================
608
+ # Main Entry Point
609
+ # =============================================================================
610
+
611
def main():
    """Run the Gradio app.

    Builds the UI with ``create_ui()`` and serves it on ``0.0.0.0`` with
    public sharing disabled. The port is taken from the
    ``GRADIO_SERVER_PORT`` environment variable. When that variable is unset,
    blank, or not a valid integer, port 7860 is used and (for an invalid
    value) a warning is logged, instead of aborting startup with a
    ``ValueError``.
    """
    import logging  # function-scope import: only needed for the fallback warning

    default_port = 7860

    app = create_ui()

    # Get port from environment or default
    raw_port = os.environ.get('GRADIO_SERVER_PORT', '').strip()
    port = default_port
    if raw_port:
        try:
            port = int(raw_port)
        except ValueError:
            # A malformed setting should not prevent the app from starting.
            logging.getLogger(__name__).warning(
                "Ignoring invalid GRADIO_SERVER_PORT=%r; falling back to %d",
                raw_port,
                default_port,
            )

    app.launch(
        server_name="0.0.0.0",
        server_port=port,
        share=False
    )


if __name__ == "__main__":
    main()