Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
MaTableGPT Gradio Web Interface
================================
A web interface for the MaTableGPT MCP service.
Provides an interactive UI for table data extraction from materials science literature.
For HuggingFace Spaces deployment.
"""
import json
import logging
import os
from typing import Optional, Tuple, Dict, Any

import gradio as gr
# Logging setup: a named module logger plus root configuration at INFO level.
logger = logging.getLogger("matablgpt-app")
logging.basicConfig(level=logging.INFO)
# Load the MCP service components. The app still starts without them:
# MCP_AVAILABLE records whether the import succeeded so handlers can
# report the missing service instead of failing on undefined names.
try:
    from mcp_service import (
        table_representer,
        table_to_json,
        table_splitter,
        session_manager,
        get_extractor,
        GPTExtractor,
    )
except ImportError as import_error:
    logger.warning(f"MCP service not available: {import_error}")
    MCP_AVAILABLE = False
else:
    MCP_AVAILABLE = True
# =============================================================================
# Helper Functions
# =============================================================================
def format_json_output(data: Any) -> str:
    """Render *data* as an indented JSON string for display.

    Non-ASCII characters are written as-is rather than ``\\u``-escaped.

    Args:
        data: The value to render; ideally JSON-serialisable.

    Returns:
        JSON text with two-space indentation, or ``str(data)`` when the
        value cannot be serialised.
    """
    try:
        return json.dumps(data, indent=2, ensure_ascii=False)
    except (TypeError, ValueError, RecursionError):
        # TypeError: unsupported value or key type; ValueError: circular
        # reference; RecursionError: excessively deep nesting. Anything
        # else (e.g. KeyboardInterrupt) must propagate, which the former
        # bare ``except:`` prevented.
        return str(data)
def check_openai_config() -> Tuple[bool, str]:
    """Check whether the LLM API configuration is complete.

    Reads the API key, base URL and model name from the environment,
    accepting several variable names for each so that third-party
    OpenAI-compatible services can be configured as well.

    Returns:
        A ``(is_configured, status_message)`` pair. ``is_configured`` is
        False when either the API key or the base URL is missing, and the
        message then names the variables to set. Otherwise the message
        summarises the key (masked), the base URL (shortened) and the model.
    """

    def _first_env(*names: str) -> str:
        """Return the first non-blank value among the named variables."""
        for name in names:
            value = os.environ.get(name, '').strip()
            if value:
                return value
        return ''

    key = _first_env('LLM_API_KEY', 'OPENAI_API_KEY')
    base_url = _first_env('LLM_API_BASE', 'OPENAI_API_BASE', 'OPENAI_BASE_URL')
    model = _first_env('LLM_MODEL', 'OPENAI_MODEL') or 'gpt-4-turbo-preview'

    if not key:
        return False, "⚠️ API Key not configured (set LLM_API_KEY or OPENAI_API_KEY). GPT extraction will not work."
    if not base_url:
        return False, "⚠️ API Base URL not configured (set LLM_API_BASE or OPENAI_API_BASE). Required for third-party API services."

    # Only hint at the key's tail when the key is long enough that four
    # characters do not reveal most (or all) of the secret.
    key_hint = key[-4:] if len(key) > 8 else ""
    # Keep the status line compact by truncating long URLs.
    display_url = base_url if len(base_url) <= 35 else base_url[:32] + "..."

    status_parts = [
        f"✅ API Key: ***{key_hint}",
        f"✅ API URL: {display_url}",
        f"✅ Model: {model}",
    ]
    return True, " | ".join(status_parts)
def check_openai_key() -> Tuple[bool, str]:
    """Backward-compatible alias that delegates to check_openai_config."""
    is_configured, status_message = check_openai_config()
    return is_configured, status_message
# =============================================================================
# Gradio Interface Functions
# =============================================================================
def convert_html_to_tsv(html_input: str, title: str, caption: str) -> str:
    """Turn an HTML table into its TSV representation.

    Args:
        html_input: HTML markup of the table.
        title: Table title passed through to the representer.
        caption: Table caption passed through to the representer.

    Returns:
        The representer's output, or an ``"Error: ..."`` message when the
        MCP service is missing, the input is blank, or conversion fails.
    """
    if not MCP_AVAILABLE:
        return "Error: MCP service not available"
    if html_input.strip() == "":
        return "Error: Please provide HTML table input"

    try:
        tsv_text = table_representer.html_to_tsv(html_input, title, caption)
    except Exception as exc:
        return "Error: " + str(exc)
    return tsv_text
def convert_html_to_json(html_input: str, title: str, caption: str) -> str:
    """Turn an HTML table into its JSON representation.

    Args:
        html_input: HTML markup of the table.
        title: Table title passed through to the converter.
        caption: Table caption passed through to the converter.

    Returns:
        Pretty-printed JSON text, or an ``"Error: ..."`` message when the
        MCP service is missing, the input is blank, or conversion fails.
    """
    if not MCP_AVAILABLE:
        return "Error: MCP service not available"
    if html_input.strip() == "":
        return "Error: Please provide HTML table input"

    try:
        table_json = table_to_json.html_to_json(html_input, title, caption)
        return format_json_output(table_json)
    except Exception as exc:
        return "Error: " + str(exc)
def analyze_table(html_input: str) -> str:
    """Report the structure of an HTML table.

    Args:
        html_input: HTML markup of the table.

    Returns:
        Pretty-printed JSON of the splitter's structure analysis, or an
        ``"Error: ..."`` message when the MCP service is missing, the input
        is blank, or the analysis fails.
    """
    if not MCP_AVAILABLE:
        return "Error: MCP service not available"
    if html_input.strip() == "":
        return "Error: Please provide HTML table input"

    try:
        structure = table_splitter.analyze_table_structure(html_input)
        return format_json_output(structure)
    except Exception as exc:
        return "Error: " + str(exc)
def split_table(html_input: str, title: str, caption: str) -> str:
    """Break a complex HTML table into simpler tables.

    Args:
        html_input: HTML markup of the table.
        title: Table title passed through to the splitter.
        caption: Table caption passed through to the splitter.

    Returns:
        Pretty-printed JSON with ``table_count`` and ``tables`` keys, or an
        ``"Error: ..."`` message when the MCP service is missing, the input
        is blank, or splitting fails.
    """
    if not MCP_AVAILABLE:
        return "Error: MCP service not available"
    if html_input.strip() == "":
        return "Error: Please provide HTML table input"

    try:
        parts = table_splitter.split_table(html_input, title, caption)
        summary = {"table_count": len(parts), "tables": parts}
        return format_json_output(summary)
    except Exception as exc:
        return "Error: " + str(exc)
def extract_zero_shot(table_repr: str) -> str:
    """Run zero-shot catalyst data extraction on a table representation.

    Args:
        table_repr: The table representation text to extract from.

    Returns:
        Pretty-printed JSON of the extractor's result, or an
        ``"Error: ..."`` message when the MCP service is missing, the input
        is blank, the API configuration is incomplete, or extraction fails.
    """
    if not MCP_AVAILABLE:
        return "Error: MCP service not available"
    if table_repr.strip() == "":
        return "Error: Please provide table representation"

    configured, status_text = check_openai_key()
    if not configured:
        return "Error: " + status_text

    try:
        extraction = get_extractor().extract_zero_shot(table_repr)
        return format_json_output(extraction)
    except Exception as exc:
        return "Error: " + str(exc)
def extract_few_shot(table_repr: str, examples_json: str) -> str:
    """Run few-shot catalyst data extraction on a table representation.

    Args:
        table_repr: The table representation text to extract from.
        examples_json: JSON text holding the few-shot examples. Blank (or
            None) means "no examples" and an empty list is used.

    Returns:
        Pretty-printed JSON of the extractor's result, or an
        ``"Error: ..."`` message when the MCP service is missing, the input
        is blank, the API configuration is incomplete, the examples are not
        valid JSON, or extraction fails.
    """
    if not MCP_AVAILABLE:
        return "Error: MCP service not available"
    if not table_repr.strip():
        return "Error: Please provide table representation"

    has_key, key_status = check_openai_key()
    if not has_key:
        return f"Error: {key_status}"

    # Parse the examples in their own try block: a JSONDecodeError raised
    # later inside the extractor must not be reported as a problem with
    # the user's examples.
    if examples_json and examples_json.strip():
        try:
            examples = json.loads(examples_json)
        except json.JSONDecodeError:
            return "Error: Invalid examples JSON format"
    else:
        examples = []

    try:
        extractor = get_extractor()
        result = extractor.extract_few_shot(table_repr, examples)
        return format_json_output(result)
    except Exception as e:
        return f"Error: {str(e)}"
def validate_extraction(extraction_json: str) -> str:
    """Validate an extraction result against the extractor's schema.

    The input must be a JSON object mapping catalyst names to dictionaries
    of performances. Performance names are checked against
    ``GPTExtractor.PERFORMANCE_LIST`` and property keys against
    ``GPTExtractor.PROPERTY_TEMPLATE``; mismatches are reported as
    warnings. An ``"error"`` key in the extraction is reported as an issue.
    The top-level keys ``error``, ``raw_response`` and ``catalysts`` are
    not treated as catalyst names.

    Args:
        extraction_json: JSON text of the extraction to validate.

    Returns:
        Pretty-printed JSON with ``valid``, ``issues`` and (for object
        input) ``warnings``, or an ``"Error: ..."`` message when the input
        is blank, is not valid JSON, or the MCP service is unavailable.
    """
    if not extraction_json.strip():
        return "Error: Please provide extraction JSON"

    try:
        extraction = json.loads(extraction_json)
    except json.JSONDecodeError:
        return "Error: Invalid JSON format"

    if not isinstance(extraction, dict):
        return format_json_output({"valid": False, "issues": ["Extraction must be a dictionary"]})

    # The schema lives on GPTExtractor, which is undefined when the MCP
    # import failed; report that instead of raising NameError.
    if not MCP_AVAILABLE:
        return "Error: MCP service not available"

    issues = []
    warnings = []

    if "error" in extraction:
        issues.append(f"Extraction contains error: {extraction['error']}")

    valid_performance_types = set(GPTExtractor.PERFORMANCE_LIST)
    known_properties = GPTExtractor.PROPERTY_TEMPLATE
    reserved_keys = ("error", "raw_response", "catalysts")

    for catalyst_name, performances in extraction.items():
        if catalyst_name in reserved_keys:
            continue
        if not isinstance(performances, dict):
            warnings.append(f"Catalyst '{catalyst_name}' should have dict of performances")
            continue
        for perf_name, properties in performances.items():
            if perf_name not in valid_performance_types:
                warnings.append(f"Unknown performance type: {perf_name}")
            if isinstance(properties, dict):
                for prop_key in properties:
                    if prop_key not in known_properties:
                        warnings.append(f"Unknown property key: {prop_key}")

    return format_json_output({
        "valid": len(issues) == 0,
        "issues": issues,
        "warnings": warnings,
    })
def get_performance_types() -> str:
    """Return the supported performance types and property template.

    Returns:
        Pretty-printed JSON with ``performance_types`` and
        ``property_template`` taken from ``GPTExtractor``, or an
        ``"Error: ..."`` message when the MCP service is unavailable.
    """
    # GPTExtractor is undefined when the MCP import failed. This function
    # runs while the UI is being built, so it must not raise in that case.
    if not MCP_AVAILABLE:
        return "Error: MCP service not available"
    return format_json_output({
        "performance_types": GPTExtractor.PERFORMANCE_LIST,
        "property_template": GPTExtractor.PROPERTY_TEMPLATE,
    })
def get_code_template(repr_format: str, model_type: str) -> str:
    """Generate a standalone Python script template for local extraction.

    The two arguments are written into the generated script's header, and
    the upper-cased format also labels the placeholder where the user
    pastes the table representation. The rest of the script is fixed.

    Args:
        repr_format: Representation format label, e.g. ``"tsv"``.
            A missing value (None or empty) falls back to ``"tsv"``.
        model_type: Model type label, e.g. ``"zero-shot"``.
            A missing value (None or empty) falls back to ``"zero-shot"``.

    Returns:
        The source code of the script as a string.
    """
    # Guard against a missing selection, which would otherwise crash on
    # ``.upper()``; the fallbacks match the UI's default choices.
    repr_format = repr_format or "tsv"
    model_type = model_type or "zero-shot"
    format_label = repr_format.upper()

    # Doubled braces are literal braces in the generated code.
    return f'''"""
MaTableGPT Local Extraction Template
Model Type: {model_type}
Representation Format: {repr_format}
"""
from openai import OpenAI
import json

# Initialize client
client = OpenAI(api_key="YOUR_API_KEY")

# Performance types to extract
PERFORMANCE_LIST = [
    'overpotential', 'tafel_slope', 'Rct', 'stability', 'Cdl',
    'onset_potential', 'current_density', 'potential', 'TOF', 'ECSA',
    'water_splitting_potential', 'mass_activity', 'exchange_current_density',
    'Rs', 'specific_activity', 'onset_overpotential', 'BET', 'surface_area',
    'loading', 'apparent_activation_energy'
]

# Your table representation
table_representation = """
# Paste your {format_label} representation here
"""

# System prompt
system_prompt = """I will extract catalyst performance information from the table and create JSON format.
Performance types: """ + str(PERFORMANCE_LIST) + """
The JSON format will have performance within the catalyst, with elements:
reaction type, value, electrolyte, condition, current density, versus, substrate.
Output must contain only JSON dictionary."""

# Extract
response = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    messages=[
        {{"role": "system", "content": system_prompt}},
        {{"role": "user", "content": table_representation}}
    ],
    temperature=0
)
result = response.choices[0].message.content.strip()
print(json.dumps(json.loads(result), indent=2))
'''
# =============================================================================
# Gradio UI
# =============================================================================
# Sample HTML table for demo.
# Four columns (catalyst, overpotential, Tafel slope, electrolyte) and three
# data rows; used as the pre-filled value of the HTML input boxes in the UI.
SAMPLE_HTML = '''<table>
<thead>
<tr>
<th>Catalyst</th>
<th>Overpotential (mV)</th>
<th>Tafel Slope (mV/dec)</th>
<th>Electrolyte</th>
</tr>
</thead>
<tbody>
<tr>
<td>Pt/C</td>
<td>280</td>
<td>65</td>
<td>1M KOH</td>
</tr>
<tr>
<td>NiFe-LDH</td>
<td>230</td>
<td>45</td>
<td>1M KOH</td>
</tr>
<tr>
<td>Co3O4</td>
<td>350</td>
<td>78</td>
<td>1M KOH</td>
</tr>
</tbody>
</table>'''
def create_ui():
    """Build the Gradio Blocks application.

    The interface has six tabs: table representation (HTML to TSV/JSON),
    table analysis and splitting, GPT extraction (zero-shot or few-shot),
    validation of extraction JSON, a code-template generator, and an
    about page. A status line under the title shows the result of the API
    configuration check.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    # Resolve the API configuration once; it drives both the status line
    # and the warning on the extraction tab.
    has_key, key_status = check_openai_key()
    status_color = "green" if has_key else "orange"

    with gr.Blocks(
        title="MaTableGPT - Table Data Extractor",
        theme="soft",
    ) as app:
        gr.Markdown("""
        # 🔬 MaTableGPT - Table Data Extractor

        **Extract structured catalyst performance data from HTML tables in materials science literature**

        This tool uses GPT models to convert complex HTML tables into structured JSON data with
        catalyst names, performance metrics (overpotential, Tafel slope, etc.), and associated properties.
        """)
        gr.Markdown(f"**Status:** <span style='color:{status_color}'>{key_status}</span>")

        with gr.Tabs():
            # Tab 1: Table Representation
            with gr.TabItem("📋 Table Representation"):
                gr.Markdown("### Convert HTML tables to TSV or JSON format")
                with gr.Row():
                    with gr.Column():
                        html_input = gr.Textbox(
                            label="HTML Table Input",
                            placeholder="Paste your HTML table here...",
                            lines=15,
                            value=SAMPLE_HTML,
                        )
                        title_input = gr.Textbox(
                            label="Table Title (optional)",
                            placeholder="e.g., Table 1: OER Catalyst Performance",
                        )
                        caption_input = gr.Textbox(
                            label="Table Caption (optional)",
                            placeholder="e.g., Performance measured at 10 mA/cm²",
                        )
                        with gr.Row():
                            tsv_btn = gr.Button("Convert to TSV", variant="primary")
                            json_btn = gr.Button("Convert to JSON", variant="primary")
                    with gr.Column():
                        repr_output = gr.Textbox(
                            label="Representation Output",
                            lines=20,
                            show_copy_button=True,
                        )
                tsv_btn.click(
                    convert_html_to_tsv,
                    inputs=[html_input, title_input, caption_input],
                    outputs=repr_output,
                )
                json_btn.click(
                    convert_html_to_json,
                    inputs=[html_input, title_input, caption_input],
                    outputs=repr_output,
                )

            # Tab 2: Table Analysis & Splitting
            with gr.TabItem("🔍 Table Analysis"):
                gr.Markdown("### Analyze and split complex tables")
                with gr.Row():
                    with gr.Column():
                        html_analyze = gr.Textbox(
                            label="HTML Table Input",
                            placeholder="Paste your HTML table here...",
                            lines=10,
                            value=SAMPLE_HTML,
                        )
                        with gr.Row():
                            analyze_btn = gr.Button("Analyze Structure", variant="secondary")
                            split_btn = gr.Button("Split Table", variant="secondary")
                    with gr.Column():
                        analysis_output = gr.Textbox(
                            label="Analysis Result",
                            lines=15,
                            show_copy_button=True,
                        )
                analyze_btn.click(
                    analyze_table,
                    inputs=html_analyze,
                    outputs=analysis_output,
                )
                # This tab has no title/caption fields of its own; splitting
                # reuses the ones from the Table Representation tab.
                split_btn.click(
                    split_table,
                    inputs=[html_analyze, title_input, caption_input],
                    outputs=analysis_output,
                )

            # Tab 3: GPT Extraction
            with gr.TabItem("🤖 GPT Extraction"):
                gr.Markdown("### Extract catalyst data using GPT models")
                if not has_key:
                    # Show the reason reported by the configuration check:
                    # it may be a missing base URL rather than a missing
                    # key, and more than one variable name is accepted.
                    gr.Markdown(f"**GPT extraction is disabled**\n\n{key_status}")
                with gr.Row():
                    with gr.Column():
                        table_repr_input = gr.Textbox(
                            label="Table Representation (TSV or JSON)",
                            placeholder="Paste your table representation here...",
                            lines=10,
                        )
                        extraction_method = gr.Radio(
                            ["Zero-shot", "Few-shot"],
                            label="Extraction Method",
                            value="Zero-shot",
                        )
                        examples_input = gr.Textbox(
                            label="Examples (for Few-shot, JSON format)",
                            placeholder='[{"input": "...", "output": "..."}]',
                            lines=5,
                            visible=False,
                        )
                        extract_btn = gr.Button("Extract Catalyst Data", variant="primary")
                    with gr.Column():
                        extraction_output = gr.Textbox(
                            label="Extraction Result",
                            lines=20,
                            show_copy_button=True,
                        )

                def update_examples_visibility(method):
                    """Show the examples box only for the few-shot method."""
                    return gr.update(visible=(method == "Few-shot"))

                extraction_method.change(
                    update_examples_visibility,
                    inputs=extraction_method,
                    outputs=examples_input,
                )

                def extract_data(table_repr, method, examples):
                    """Dispatch to the extraction function for *method*."""
                    if method == "Zero-shot":
                        return extract_zero_shot(table_repr)
                    return extract_few_shot(table_repr, examples)

                extract_btn.click(
                    extract_data,
                    inputs=[table_repr_input, extraction_method, examples_input],
                    outputs=extraction_output,
                )

            # Tab 4: Validation
            with gr.TabItem("✅ Validation"):
                gr.Markdown("### Validate extraction results")
                with gr.Row():
                    with gr.Column():
                        validation_input = gr.Textbox(
                            label="Extraction JSON to Validate",
                            placeholder="Paste extraction JSON here...",
                            lines=15,
                        )
                        validate_btn = gr.Button("Validate", variant="secondary")
                    with gr.Column():
                        validation_output = gr.Textbox(
                            label="Validation Result",
                            lines=10,
                        )
                        gr.Markdown("### Supported Performance Types")
                        # The schema comes from the MCP service; without it
                        # show a notice so the UI can still be built.
                        gr.Textbox(
                            label="",
                            value=(
                                get_performance_types()
                                if MCP_AVAILABLE
                                else "Error: MCP service not available"
                            ),
                            lines=10,
                            interactive=False,
                        )
                validate_btn.click(
                    validate_extraction,
                    inputs=validation_input,
                    outputs=validation_output,
                )

            # Tab 5: Code Template
            with gr.TabItem("💻 Code Template"):
                gr.Markdown("### Generate Python code for local extraction")
                with gr.Row():
                    repr_format = gr.Dropdown(
                        ["tsv", "json"],
                        label="Representation Format",
                        value="tsv",
                    )
                    model_type = gr.Dropdown(
                        ["zero-shot", "few-shot", "fine-tuning"],
                        label="Model Type",
                        value="zero-shot",
                    )
                generate_btn = gr.Button("Generate Code", variant="secondary")
                code_output = gr.Code(
                    label="Python Code Template",
                    language="python",
                    lines=30,
                )
                generate_btn.click(
                    get_code_template,
                    inputs=[repr_format, model_type],
                    outputs=code_output,
                )

            # Tab 6: About
            with gr.TabItem("ℹ️ About"):
                gr.Markdown("""
                ## About MaTableGPT

                MaTableGPT is a GPT-based table data extractor specifically designed for
                materials science literature. It converts complex HTML tables containing
                catalyst performance data into structured JSON format.

                ### Workflow

                1. **Table Representation**: Convert HTML tables to TSV or JSON format
                2. **Table Splitting** (optional): Break down complex tables with multiple headers
                3. **GPT Extraction**: Use zero-shot, few-shot, or fine-tuned models to extract data
                4. **Validation**: Verify extraction results against expected schema

                ### Supported Performance Types

                - Overpotential, Tafel slope, Rct, Stability, Cdl
                - Onset potential, Current density, Potential, TOF, ECSA
                - Water splitting potential, Mass activity, Exchange current density
                - Rs, Specific activity, Onset overpotential, BET, Surface area
                - Loading, Apparent activation energy

                ### MCP Integration

                This service is also available as an MCP (Model Context Protocol) server,
                allowing integration with AI assistants like Claude.

                ### Credits

                Based on [MaTableGPT](https://github.com/your-repo/MaTableGPT) research.
                """)

        gr.Markdown("---\n*MaTableGPT MCP Service - Materials Science Table Data Extraction*")

    return app
# =============================================================================
# Main Entry Point
# =============================================================================
def main():
    """Build the interface and serve it.

    The listening port is read from ``GRADIO_SERVER_PORT`` (default 7860);
    the server binds to all interfaces and no public share link is made.
    """
    app = create_ui()

    # Get port from environment or default
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": int(os.environ.get('GRADIO_SERVER_PORT', 7860)),
        "share": False,
    }
    app.launch(**launch_options)


if __name__ == "__main__":
    main()