SEUyishu committed on
Commit
1742f51
·
verified ·
1 Parent(s): dd5e12b

Upload 6 files

Browse files
Files changed (5) hide show
  1. Dockerfile +15 -12
  2. README.md +71 -22
  3. mcp_service.py +378 -0
  4. requirements.txt +24 -26
  5. start_mcp.py +35 -13
Dockerfile CHANGED
@@ -1,6 +1,6 @@
1
  # MaTableGPT MCP Service Docker Image
2
  # ====================================
3
- # For HuggingFace Spaces Deployment
4
 
5
  FROM python:3.10-slim
6
 
@@ -10,13 +10,17 @@ WORKDIR /app
10
  # Set environment variables
11
  ENV PYTHONDONTWRITEBYTECODE=1
12
  ENV PYTHONUNBUFFERED=1
13
- ENV GRADIO_SERVER_NAME=0.0.0.0
14
- ENV GRADIO_SERVER_PORT=7860
 
 
 
15
 
16
  # Install system dependencies
17
  RUN apt-get update && apt-get install -y --no-install-recommends \
18
  build-essential \
19
  git \
 
20
  && rm -rf /var/lib/apt/lists/*
21
 
22
  # Copy requirements first for better caching
@@ -27,7 +31,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
27
  pip install --no-cache-dir -r requirements.txt
28
 
29
  # Download NLTK data for table splitting
30
- RUN python -c "import nltk; nltk.download('punkt')"
31
 
32
  # Copy application code
33
  COPY . .
@@ -38,13 +42,12 @@ RUN mkdir -p /app/sessions /app/temp
38
  # Set permissions for HuggingFace Spaces
39
  RUN chmod -R 777 /app/sessions /app/temp
40
 
41
- # Expose ports
42
- # 7860 for Gradio, 7865 for MCP SSE
43
- EXPOSE 7860 7865
44
 
45
- # Health check
46
- HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
47
- CMD python -c "import requests; requests.get('http://localhost:7860/')" || exit 1
48
 
49
- # Run the application
50
- CMD ["python", "app.py"]
 
1
  # MaTableGPT MCP Service Docker Image
2
  # ====================================
3
+ # For HuggingFace Spaces Deployment (SSE Mode)
4
 
5
  FROM python:3.10-slim
6
 
 
10
  # Set environment variables
11
  ENV PYTHONDONTWRITEBYTECODE=1
12
  ENV PYTHONUNBUFFERED=1
13
+
14
+ # MCP SSE Server Configuration
15
+ # HuggingFace Spaces uses port 7860
16
+ ENV MCP_HOST=0.0.0.0
17
+ ENV MCP_PORT=7860
18
 
19
  # Install system dependencies
20
  RUN apt-get update && apt-get install -y --no-install-recommends \
21
  build-essential \
22
  git \
23
+ curl \
24
  && rm -rf /var/lib/apt/lists/*
25
 
26
  # Copy requirements first for better caching
 
31
  pip install --no-cache-dir -r requirements.txt
32
 
33
  # Download NLTK data for table splitting
34
+ RUN python -c "import nltk; nltk.download('punkt')" || true
35
 
36
  # Copy application code
37
  COPY . .
 
42
  # Set permissions for HuggingFace Spaces
43
  RUN chmod -R 777 /app/sessions /app/temp
44
 
45
+ # Expose MCP SSE port (HuggingFace Spaces uses 7860)
46
+ EXPOSE 7860
 
47
 
48
+ # Health check for MCP SSE endpoint
49
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=10s --retries=3 \
50
+ CMD curl -f http://localhost:7860/sse || exit 1
51
 
52
+ # Run MCP service in SSE mode
53
+ CMD ["python", "start_mcp.py", "--mode", "sse", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -35,11 +35,29 @@ A Model Context Protocol (MCP) service that extracts structured catalyst perform
35
  - Store representations and extractions
36
  - Export session data for analysis
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  ## 📦 Installation
39
 
40
  ### Prerequisites
41
  - Python 3.8+
42
- - OpenAI API key (for GPT extraction)
43
 
44
  ### Local Installation
45
 
@@ -94,22 +112,20 @@ This service supports third-party API services (reverse proxy, OneAPI, API aggre
94
 
95
  ## 🚀 Usage
96
 
97
- ### Start MCP Server (stdio mode)
98
 
99
  ```bash
 
100
  python start_mcp.py
101
- ```
102
-
103
- ### Start MCP Server (SSE mode for web integration)
104
 
105
- ```bash
106
- python start_mcp.py --mode sse --port 7865
107
  ```
108
 
109
- ### Start Gradio Web Interface
110
 
111
  ```bash
112
- python app.py
113
  ```
114
 
115
  ## 🔧 MCP Tools Reference
@@ -137,13 +153,26 @@ python app.py
137
  | `extract_catalyst_data_zero_shot` | Extract using zero-shot GPT |
138
  | `extract_catalyst_data_few_shot` | Extract with example pairs |
139
  | `extract_catalyst_data_fine_tuned` | Extract using fine-tuned model |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  ### Utilities
142
 
143
  | Tool | Description |
144
  |------|-------------|
145
  | `list_performance_types` | List supported catalyst performance types |
146
- | `validate_extraction_result` | Validate extraction against schema |
147
  | `get_extraction_code_template` | Get Python code for local extraction |
148
  | `get_environment_requirements` | Get setup requirements |
149
 
@@ -204,46 +233,66 @@ session_data = get_session_data(session_id)
204
  docker build -t matablgpt-mcp .
205
  ```
206
 
207
- ### Run container
208
 
209
  ```bash
210
- docker run -p 7860:7860 -p 7865:7865 \
211
- -e OPENAI_API_KEY=your_key \
 
212
  matablgpt-mcp
213
  ```
214
 
215
  ## 🤗 HuggingFace Spaces Deployment
216
 
217
- 1. Create a new Space with Docker SDK
218
  2. Upload all files from `mcp_output/`
219
- 3. Add `OPENAI_API_KEY` as a secret in Space settings
220
- 4. Space will auto-build and deploy
 
 
 
 
221
 
222
  ## 📝 MCP Client Configuration
223
 
224
- Add to your MCP client configuration (e.g., Claude Desktop):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
  ```json
227
  {
228
  "mcpServers": {
229
  "matablgpt": {
230
  "command": "python",
231
- "args": ["path/to/mcp_output/start_mcp.py"],
232
  "env": {
233
- "OPENAI_API_KEY": "your_key"
 
234
  }
235
  }
236
  }
237
  }
238
  ```
239
 
240
- Or for SSE mode:
241
 
242
  ```json
243
  {
244
  "mcpServers": {
245
  "matablgpt": {
246
- "url": "http://localhost:7865/sse"
247
  }
248
  }
249
  }
@@ -273,7 +322,7 @@ Extracted data follows this JSON schema:
273
 
274
  ## 🙏 Acknowledgments
275
 
276
- Based on [MaTableGPT](https://github.com/your-repo/MaTableGPT) - GPT-based Table Data Extractor from Materials Science Literature.
277
 
278
  ## 📜 License
279
 
 
35
  - Store representations and extractions
36
  - Export session data for analysis
37
 
38
+ ## 🚀 Quick Start (HuggingFace Space SSE Mode)
39
+
40
+ This service runs as a **pure MCP SSE server** on HuggingFace Space, accessible via SSE endpoint.
41
+
42
+ **SSE Endpoint**: `https://your-space-name.hf.space/sse`
43
+
44
+ ### Connect from Cursor/Claude Desktop
45
+
46
+ ```json
47
+ {
48
+ "mcpServers": {
49
+ "matablgpt": {
50
+ "url": "https://your-space-name.hf.space/sse"
51
+ }
52
+ }
53
+ }
54
+ ```
55
+
56
  ## 📦 Installation
57
 
58
  ### Prerequisites
59
  - Python 3.8+
60
+ - OpenAI-compatible API key (for GPT extraction)
61
 
62
  ### Local Installation
63
 
 
112
 
113
  ## 🚀 Usage
114
 
115
+ ### Start MCP Server (SSE mode - Default for HuggingFace Space)
116
 
117
  ```bash
118
+ # Default: SSE mode on port 7860
119
  python start_mcp.py
 
 
 
120
 
121
+ # Custom port
122
+ python start_mcp.py --mode sse --port 8080
123
  ```
124
 
125
+ ### Start MCP Server (stdio mode - For local Cursor integration)
126
 
127
  ```bash
128
+ python start_mcp.py --mode stdio
129
  ```
130
 
131
  ## 🔧 MCP Tools Reference
 
153
  | `extract_catalyst_data_zero_shot` | Extract using zero-shot GPT |
154
  | `extract_catalyst_data_few_shot` | Extract with example pairs |
155
  | `extract_catalyst_data_fine_tuned` | Extract using fine-tuned model |
156
+ | `batch_extract_tables` | Extract from multiple tables in batch |
157
+
158
+ ### Follow-up & Refinement
159
+
160
+ | Tool | Description |
161
+ |------|-------------|
162
+ | `apply_follow_up_questions` | Refine extraction with iterative Q&A (from original MaTableGPT) |
163
+
164
+ ### Evaluation
165
+
166
+ | Tool | Description |
167
+ |------|-------------|
168
+ | `evaluate_extraction` | Compute Structure F1 Score and Value Accuracy |
169
+ | `validate_extraction_result` | Validate extraction against schema |
170
 
171
  ### Utilities
172
 
173
  | Tool | Description |
174
  |------|-------------|
175
  | `list_performance_types` | List supported catalyst performance types |
 
176
  | `get_extraction_code_template` | Get Python code for local extraction |
177
  | `get_environment_requirements` | Get setup requirements |
178
 
 
233
  docker build -t matablgpt-mcp .
234
  ```
235
 
236
+ ### Run container (SSE mode)
237
 
238
  ```bash
239
+ docker run -p 7860:7860 \
240
+ -e LLM_API_KEY=your_key \
241
+ -e LLM_API_BASE=https://api.your-service.com/v1 \
242
  matablgpt-mcp
243
  ```
244
 
245
  ## 🤗 HuggingFace Spaces Deployment
246
 
247
+ 1. Create a new Space with **Docker SDK**
248
  2. Upload all files from `mcp_output/`
249
+ 3. Add secrets in Space settings:
250
+ - `LLM_API_KEY`: Your API key
251
+ - `LLM_API_BASE`: Your API base URL (e.g., `https://api.your-service.com/v1`)
252
+ - `LLM_MODEL`: (Optional) Model name
253
+ 4. Space will auto-build and deploy the MCP SSE service
254
+ 5. Connect via: `https://your-space-name.hf.space/sse`
255
 
256
  ## 📝 MCP Client Configuration
257
 
258
+ ### For Cursor (SSE mode - HuggingFace Space)
259
+
260
+ Add to `~/.cursor/mcp.json`:
261
+
262
+ ```json
263
+ {
264
+ "mcpServers": {
265
+ "matablgpt": {
266
+ "url": "https://your-space-name.hf.space/sse"
267
+ }
268
+ }
269
+ }
270
+ ```
271
+
272
+ ### For Cursor (stdio mode - Local)
273
 
274
  ```json
275
  {
276
  "mcpServers": {
277
  "matablgpt": {
278
  "command": "python",
279
+ "args": ["path/to/mcp_output/start_mcp.py", "--mode", "stdio"],
280
  "env": {
281
+ "LLM_API_KEY": "your_key",
282
+ "LLM_API_BASE": "https://api.your-service.com/v1"
283
  }
284
  }
285
  }
286
  }
287
  ```
288
 
289
+ ### For Claude Desktop
290
 
291
  ```json
292
  {
293
  "mcpServers": {
294
  "matablgpt": {
295
+ "url": "https://your-space-name.hf.space/sse"
296
  }
297
  }
298
  }
 
322
 
323
  ## 🙏 Acknowledgments
324
 
325
+ Based on [MaTableGPT](https://github.com/KIST-CSRC/MaTableGPT) - GPT-based Table Data Extractor from Materials Science Literature.
326
 
327
  ## 📜 License
328
 
mcp_service.py CHANGED
@@ -1323,6 +1323,384 @@ print(json.dumps(json.loads(result), indent=2))
1323
  }
1324
 
1325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1326
  @mcp.tool()
1327
  def get_environment_requirements() -> Dict:
1328
  """
 
1323
  }
1324
 
1325
 
1326
def _ask_follow_up(extractor, messages: List[Dict], question: str) -> str:
    """Send one follow-up question and record both turns in ``messages``.

    Args:
        extractor: Object exposing ``client.chat.completions.create`` and
            ``get_model()`` (as returned by ``get_extractor()``).
        messages: Running chat history; mutated in place.
        question: User-turn content to send.

    Returns:
        The stripped text of the assistant's reply.

    Raises:
        ValueError: If the model reply carries no text content.
    """
    messages.append({"role": "user", "content": question})
    response = extractor.client.chat.completions.create(
        model=extractor.get_model(),
        messages=messages,
        temperature=0
    )
    content = response.choices[0].message.content
    if content is None:
        # Fail with a readable message instead of an AttributeError on .strip().
        raise ValueError("LLM returned an empty response to a follow-up question")
    answer = content.strip()
    messages.append({"role": "assistant", "content": answer})
    return answer


def _parse_refined_json(raw: str):
    """Parse the model's final answer into a dict.

    Accepts bare JSON, JSON wrapped in a Markdown code fence (optionally with
    prose around the fence), and Python-literal dicts with single quotes.

    Returns:
        The parsed dict, or ``None`` when the text cannot be parsed into one.
    """
    import ast
    import re

    text = raw.strip()
    fenced = re.search(r"```(?:json|python)?\s*(.*?)```", text, re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()
    else:
        # Unclosed or stray fence markers: strip them as the original code did.
        text = text.replace("```json", "").replace("```", "").strip()

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # The system prompt shows a single-quoted template, so models often
        # answer with a Python-style dict; literal_eval only evaluates literals.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None

    return parsed if isinstance(parsed, dict) else None


@mcp.tool()
def apply_follow_up_questions(
    extraction_result: Dict,
    table_representation: str,
    session_id: str = "",
    table_name: str = ""
) -> Dict:
    """
    Apply follow-up questions to refine and validate extraction results.

    Runs a five-step conversation with the LLM:
    1. List catalyst names found in the table representation
    2. List catalyst names found in the extraction JSON
    3. Reconcile the two lists
    4. Check the title/caption for reaction_type, electrolyte, substrate
    5. Ask for the refined JSON

    Args:
        extraction_result: Initial extraction result to refine
        table_representation: Original table representation for verification
        session_id: Optional session ID to save refined results
        table_name: Optional table name

    Returns:
        Dictionary with ``success``, ``original``, ``refined``,
        ``follow_up_applied``, ``refinement_parsed`` and ``verification_steps``.
        When the model's final answer cannot be parsed into a dict, ``refined``
        falls back to ``extraction_result`` and both ``follow_up_applied`` and
        ``refinement_parsed`` are ``False``. On any exception the dictionary
        contains ``success: False`` and ``error`` instead.
    """
    try:
        extractor = get_extractor()

        # Initialize message context
        system_prompt = """You need to modify the JSON representing the table.
JSON template: {'catalyst_name': {'performance_name': {property_template}}}
property_template: {'electrolyte': '', 'reaction_type': '', 'value': '', 'current_density': '', 'overpotential': '', 'potential': '', 'substrate': '', 'versus': '', 'condition': ''}
performance_list = """ + str(GPTExtractor.PERFORMANCE_LIST) + """
Replace 'catalyst_name' and 'performance_name' with actual names from the table."""

        messages = [{"role": "system", "content": system_prompt}]

        # Step 1: Verify catalysts in table
        verify_q = f"""<input representation>
{table_representation}

Question 1: List all catalyst names in the table representation as a Python list. Only output the Python list."""
        catalysts_in_table = _ask_follow_up(extractor, messages, verify_q)

        # Step 2: Get catalysts from extraction
        extraction_catalysts_q = f"""<input json>
{json.dumps(extraction_result)}

Question 2: List all catalyst names from the input json as a Python list. Only output the Python list."""
        catalysts_in_json = _ask_follow_up(extractor, messages, extraction_catalysts_q)

        # Step 3: Reconcile catalysts
        reconcile_q = """Question 3: Based on answers to Question 1 and 2, modify or remove any catalysts
from Question 2 that don't match Question 1. Output the corrected Python list."""
        reconciled_catalysts = _ask_follow_up(extractor, messages, reconcile_q)

        # Step 4: Check for title/caption info
        title_caption_q = f"""<input representation>
{table_representation}

Question 4: Check the title and caption of the table.
- Is there reaction type info (OER, HER, oxygen evolution, hydrogen evolution)?
- Is there electrolyte info?
- Is there substrate info?
Answer in format: {{"reaction_type": "yes/no", "electrolyte": "yes/no", "substrate": "yes/no"}}"""
        metadata_check = _ask_follow_up(extractor, messages, title_caption_q)

        # Step 5: Apply refinements
        refine_q = f"""<input json>
{json.dumps(extraction_result)}

Based on the above analysis:
1. Keep only catalysts that exist in the table
2. Remove any 'NA', 'unknown', or empty values
3. If title/caption lacks reaction_type/electrolyte/substrate info, remove those keys
4. Output the refined JSON only. No explanation."""
        refined_result = _ask_follow_up(extractor, messages, refine_q)

        # Parse result; fall back to the original when the answer is unusable,
        # and report that honestly instead of claiming a refinement happened.
        parsed = _parse_refined_json(refined_result)
        refinement_parsed = parsed is not None
        refined_json = parsed if refinement_parsed else extraction_result

        # Save if session provided
        if session_id:
            extraction_record = ExtractionResult(
                session_id=session_id,
                table_name=table_name or "unnamed",
                model_type="follow-up-refined",
                result=refined_json,
                timestamp=datetime.now().isoformat(),
                follow_up_applied=refinement_parsed
            )
            session_manager.save_extraction(session_id, extraction_record)

        return {
            "success": True,
            "original": extraction_result,
            "refined": refined_json,
            "follow_up_applied": refinement_parsed,
            "refinement_parsed": refinement_parsed,
            "verification_steps": {
                "catalysts_in_table": catalysts_in_table,
                "catalysts_in_json": catalysts_in_json,
                "reconciled": reconciled_catalysts,
                "metadata_check": metadata_check
            }
        }

    except Exception as e:
        # Tool boundary: report the failure to the MCP client instead of raising.
        return {
            "success": False,
            "error": str(e),
            "original": extraction_result,
            "follow_up_applied": False
        }
1485
+
1486
+
1487
+ @mcp.tool()
1488
def evaluate_extraction(
    prediction: Dict,
    ground_truth: Dict,
    evaluation_type: str = "both"
) -> Dict:
    """
    Evaluate extraction results against ground truth.

    Computes two metrics:
    - Structure F1 Score: overlap of the nested key paths of both inputs
      (paths containing 'condition' are ignored)
    - Value Accuracy: fraction of ground-truth leaf values whose normalized
      text equals the prediction's value at the same key path

    Args:
        prediction: The extracted/predicted result
        ground_truth: The expected correct result
        evaluation_type: "structure", "value", or "both"

    Returns:
        Dictionary containing ``success`` plus, depending on
        ``evaluation_type``, ``structure_f1``/``structure_details``,
        ``value_accuracy``/``value_details`` and ``overall_score``.
        An unsupported ``evaluation_type`` or an internal error yields
        ``success: False`` with an ``error`` message.
    """
    import re
    import unicodedata

    def normalize_text(text) -> str:
        """Normalize text for comparison."""
        if not isinstance(text, str):
            text = str(text)
        # Fold compatibility forms (superscript digits/minus, NBSP, ...)
        text = unicodedata.normalize('NFKD', text)
        text = re.sub(r'<sup>|</sup>', '', text)
        # Map en dash and minus sign to ASCII hyphen FIRST, so the unit
        # patterns below only have to know one spelling of the exponent sign.
        text = re.sub(r'[\u2013\u2212]', '-', text)
        text = re.sub(r'm2\s*g-1', 'm2/g', text)
        text = re.sub(r'mA\s*cm-2', 'mA/cm2', text)
        text = re.sub(r'\s+', '', text)
        return text.lower()

    def get_all_keys(d, parent_key: str = '', sep: str = '//') -> List[str]:
        """Recursively collect every key path from a nested dict/list."""
        keys = []
        if isinstance(d, dict):
            for k, v in d.items():
                new_key = f"{parent_key}{sep}{k}" if parent_key else k
                keys.append(new_key)
                keys.extend(get_all_keys(v, new_key, sep))
        elif isinstance(d, list):
            for i, item in enumerate(d):
                keys.extend(get_all_keys(item, f"{parent_key}[{i}]", sep))
        return keys

    def get_key_value_pairs(d, parent_key: str = '') -> List[tuple]:
        """Collect (key path, normalized leaf value) pairs from a nested dict/list."""
        pairs = []
        if isinstance(d, dict):
            for k, v in d.items():
                new_key = f"{parent_key}//{k}" if parent_key else k
                if isinstance(v, (dict, list)):
                    pairs.extend(get_key_value_pairs(v, new_key))
                else:
                    pairs.append((new_key, normalize_text(str(v))))
        elif isinstance(d, list):
            for i, item in enumerate(d):
                pairs.extend(get_key_value_pairs(item, f"{parent_key}[{i}]"))
        return pairs

    if evaluation_type not in ("structure", "value", "both"):
        # Previously this returned success=True with no metrics at all.
        return {
            "success": False,
            "error": (
                f"Invalid evaluation_type: {evaluation_type!r}. "
                "Expected 'structure', 'value', or 'both'."
            )
        }

    results = {"success": True}

    try:
        # Structure F1 Score
        if evaluation_type in ["structure", "both"]:
            # Remove 'condition' keys as per original
            pred_keys = {k for k in get_all_keys(prediction) if 'condition' not in k}
            gt_keys = {k for k in get_all_keys(ground_truth) if 'condition' not in k}

            matched = pred_keys & gt_keys
            extra = pred_keys - gt_keys
            missing = gt_keys - pred_keys
            tp, fp, fn = len(matched), len(extra), len(missing)

            if tp + fp + fn > 0:
                f1_score = tp / (tp + 0.5 * (fp + fn))
            else:
                # Both sides are empty: nothing to get wrong.
                f1_score = 1.0

            results["structure_f1"] = round(f1_score, 4)
            results["structure_details"] = {
                "true_positives": tp,
                "false_positives": fp,
                "false_negatives": fn,
                # Sorted so the 10-item samples are deterministic.
                "matched_keys": sorted(matched)[:10],
                "missing_keys": sorted(missing)[:10],
                "extra_keys": sorted(extra)[:10]
            }

        # Value Accuracy
        if evaluation_type in ["value", "both"]:
            gt_pairs = get_key_value_pairs(ground_truth)
            # Values are already normalized by get_key_value_pairs.
            pred_dict = dict(get_key_value_pairs(prediction))

            total = len(gt_pairs)
            correct = sum(
                1 for key, value in gt_pairs
                if key in pred_dict and pred_dict[key] == value
            )

            value_accuracy = correct / total if total > 0 else 1.0

            results["value_accuracy"] = round(value_accuracy, 4)
            results["value_details"] = {
                "correct_values": correct,
                "total_values": total,
                "accuracy_percentage": round(value_accuracy * 100, 2)
            }

        # Overall score
        if evaluation_type == "both":
            results["overall_score"] = round(
                (results["structure_f1"] + results["value_accuracy"]) / 2, 4
            )

    except Exception as e:
        # Tool boundary: report the failure to the client instead of raising.
        results["success"] = False
        results["error"] = str(e)

    return results
1623
+
1624
+
1625
@mcp.tool()
def batch_extract_tables(
    tables: List[Dict],
    model_type: str = "zero-shot",
    apply_follow_up: bool = False,
    session_id: str = ""
) -> Dict:
    """
    Extract data from multiple tables in batch.

    Each table is processed independently: a failure on one table is recorded
    in its entry and does not stop the remaining tables.

    Args:
        tables: List of {"html": html_table, "title": title, "caption": caption, "name": table_name}
        model_type: "zero-shot" or "few-shot". "fine-tuning" is not supported
            here and produces a failed entry for every table.
        apply_follow_up: Whether to apply follow-up questions for refinement
        session_id: Optional session ID; a new session is created when empty

    Returns:
        Dictionary with ``success``, ``session_id``, ``total_tables``,
        ``extractions`` (one entry per table with ``table_name``, ``success``
        and either ``extraction`` or ``error``), ``successful_extractions``
        and ``failed_extractions``.
    """
    from copy import deepcopy

    if not session_id:
        session_id = session_manager.create_session()

    results = {
        "success": True,
        "session_id": session_id,
        "total_tables": len(tables),
        "extractions": []
    }

    for i, table_info in enumerate(tables):
        # Default name is set before the try so the except branch can use it
        # even when table_info itself is malformed.
        table_name = f"table_{i+1}"

        try:
            table_name = table_info.get("name", table_name)
            html = table_info.get("html", "")
            title = table_info.get("title", "")
            caption = table_info.get("caption", "")

            # Convert to representation
            representation = table_representer.html_to_tsv(html, title, caption)

            # Extract based on model type
            extractor = get_extractor()
            if model_type == "zero-shot":
                extraction = extractor.extract_zero_shot(representation)
            elif model_type == "few-shot":
                extraction = extractor.extract_few_shot(representation)
            elif model_type == "fine-tuning":
                extraction = {"error": "Fine-tuning requires model_name parameter"}
            else:
                extraction = {"error": f"Unsupported model_type: {model_type!r}"}

            has_error = "error" in extraction

            # Apply follow-up if requested
            if apply_follow_up and not has_error:
                follow_up_result = apply_follow_up_questions(
                    deepcopy(extraction),
                    representation,
                    session_id,
                    table_name
                )
                if follow_up_result.get("success"):
                    extraction = follow_up_result.get("refined", extraction)

            entry = {
                "table_name": table_name,
                # An extraction carrying an "error" marker is a failure, not a success.
                "success": not has_error,
                "extraction": extraction
            }
            if has_error and isinstance(extraction, dict):
                # Mirror the shape of the exception branch below.
                entry["error"] = str(extraction["error"])
            results["extractions"].append(entry)

        except Exception as e:
            # Keep going: one bad table must not abort the whole batch.
            results["extractions"].append({
                "table_name": table_name,
                "success": False,
                "error": str(e)
            })

    results["successful_extractions"] = sum(1 for e in results["extractions"] if e["success"])
    results["failed_extractions"] = results["total_tables"] - results["successful_extractions"]

    return results
1702
+
1703
+
1704
  @mcp.tool()
1705
  def get_environment_requirements() -> Dict:
1706
  """
requirements.txt CHANGED
@@ -1,26 +1,24 @@
1
- # MaTableGPT MCP Service Requirements
2
- # ====================================
3
-
4
- # Core MCP Framework
5
- mcp>=0.1.0
6
-
7
- # OpenAI-compatible API client
8
- openai>=1.0.0
9
-
10
- # HTML Parsing
11
- beautifulsoup4>=4.12.0
12
- lxml>=4.9.0
13
-
14
- # Data Processing
15
- pandas>=2.0.0
16
-
17
- # Web Framework for HuggingFace Space
18
- # Pin to stable version with compatible huggingface_hub
19
- gradio==4.44.0
20
- huggingface_hub>=0.24.0,<1.0.0
21
-
22
- # Async Support
23
- httpx>=0.25.0
24
-
25
- # Optional: For table splitting analysis
26
- nltk>=3.8.0
 
1
+ # MaTableGPT MCP Service Requirements
2
+ # ====================================
3
+
4
+ # Core MCP Framework (with SSE support)
5
+ mcp[cli]>=1.0.0
6
+
7
+ # OpenAI-compatible API client
8
+ openai>=1.0.0
9
+
10
+ # HTML Parsing
11
+ beautifulsoup4>=4.12.0
12
+ lxml>=4.9.0
13
+
14
+ # Data Processing
15
+ pandas>=2.0.0
16
+
17
+ # SSE/HTTP Support
18
+ starlette>=0.27.0
19
+ uvicorn>=0.23.0
20
+ sse-starlette>=1.6.0
21
+ httpx>=0.25.0
22
+
23
+ # Optional: For table splitting analysis
24
+ nltk>=3.8.0
 
 
start_mcp.py CHANGED
@@ -11,8 +11,15 @@ Usage:
11
 
12
  Arguments:
13
  --host Host address (default: 0.0.0.0)
14
- --port Port number (default: 7865)
15
- --mode Run mode: 'stdio' or 'sse' (default: stdio)
 
 
 
 
 
 
 
16
  """
17
 
18
  import os
@@ -35,13 +42,19 @@ def check_environment():
35
  """Check if required environment variables are set."""
36
  warnings = []
37
 
38
- if not os.environ.get('OPENAI_API_KEY'):
 
 
39
  warnings.append(
40
- "OPENAI_API_KEY not set. GPT extraction features will not work. "
41
- "Set it with: export OPENAI_API_KEY=your_key (Unix) or "
42
- "set OPENAI_API_KEY=your_key (Windows)"
43
  )
44
 
 
 
 
 
 
45
  return warnings
46
 
47
 
@@ -50,7 +63,7 @@ def check_dependencies():
50
  missing = []
51
 
52
  required = [
53
- ('mcp', 'mcp'),
54
  ('openai', 'openai'),
55
  ('bs4', 'beautifulsoup4'),
56
  ('pandas', 'pandas'),
@@ -68,25 +81,29 @@ def check_dependencies():
68
 
69
  def main():
70
  """Main entry point."""
 
 
 
 
71
  parser = argparse.ArgumentParser(
72
  description="MaTableGPT MCP Server - Table Data Extraction from Materials Science Literature"
73
  )
74
  parser.add_argument(
75
  '--host',
76
- default='0.0.0.0',
77
- help='Host address (default: 0.0.0.0)'
78
  )
79
  parser.add_argument(
80
  '--port',
81
  type=int,
82
- default=7865,
83
- help='Port number (default: 7865)'
84
  )
85
  parser.add_argument(
86
  '--mode',
87
  choices=['stdio', 'sse'],
88
- default='stdio',
89
- help='Run mode: stdio for standard I/O, sse for Server-Sent Events (default: stdio)'
90
  )
91
  parser.add_argument(
92
  '--debug',
@@ -119,6 +136,7 @@ def main():
119
  if args.mode == 'sse':
120
  logger.info(f"Host: {args.host}")
121
  logger.info(f"Port: {args.port}")
 
122
  logger.info("=" * 60)
123
 
124
  # Import and run MCP service
@@ -130,13 +148,17 @@ def main():
130
  mcp.run()
131
  else:
132
  logger.info(f"Starting MCP server in SSE mode on {args.host}:{args.port}...")
 
133
  mcp.run(transport='sse', host=args.host, port=args.port)
134
 
135
  except ImportError as e:
136
  logger.error(f"Failed to import MCP service: {e}")
 
137
  sys.exit(1)
138
  except Exception as e:
139
  logger.error(f"Error starting MCP server: {e}")
 
 
140
  sys.exit(1)
141
 
142
 
 
11
 
12
  Arguments:
13
  --host Host address (default: 0.0.0.0)
14
+ --port Port number (default: 7860)
15
+ --mode Run mode: 'stdio' or 'sse' (default: sse for HuggingFace Space)
16
+
17
+ Environment Variables:
18
+ LLM_API_KEY / OPENAI_API_KEY - API key for LLM service
19
+ LLM_API_BASE / OPENAI_API_BASE - Custom API base URL (for third-party services)
20
+ LLM_MODEL / OPENAI_MODEL - Model name (default: gpt-4-turbo-preview)
21
+ MCP_HOST - Server host (default: 0.0.0.0)
22
+ MCP_PORT - Server port (default: 7860)
23
  """
24
 
25
  import os
 
42
  """Check if required environment variables are set."""
43
  warnings = []
44
 
45
+ # Check for API key (support both naming conventions)
46
+ api_key = os.environ.get('LLM_API_KEY') or os.environ.get('OPENAI_API_KEY')
47
+ if not api_key:
48
  warnings.append(
49
+ "LLM_API_KEY/OPENAI_API_KEY not set. GPT extraction features will not work. "
50
+ "Set it in HuggingFace Space secrets or environment variables."
 
51
  )
52
 
53
+ # Check for API base (for third-party services)
54
+ api_base = os.environ.get('LLM_API_BASE') or os.environ.get('OPENAI_API_BASE')
55
+ if api_base:
56
+ logger.info(f"Using custom API base: {api_base}")
57
+
58
  return warnings
59
 
60
 
 
63
  missing = []
64
 
65
  required = [
66
+ ('mcp', 'mcp[cli]'),
67
  ('openai', 'openai'),
68
  ('bs4', 'beautifulsoup4'),
69
  ('pandas', 'pandas'),
 
81
 
82
  def main():
83
  """Main entry point."""
84
+ # Get default values from environment variables
85
+ default_host = os.environ.get('MCP_HOST', '0.0.0.0')
86
+ default_port = int(os.environ.get('MCP_PORT', '7860'))
87
+
88
  parser = argparse.ArgumentParser(
89
  description="MaTableGPT MCP Server - Table Data Extraction from Materials Science Literature"
90
  )
91
  parser.add_argument(
92
  '--host',
93
+ default=default_host,
94
+ help=f'Host address (default: {default_host})'
95
  )
96
  parser.add_argument(
97
  '--port',
98
  type=int,
99
+ default=default_port,
100
+ help=f'Port number (default: {default_port})'
101
  )
102
  parser.add_argument(
103
  '--mode',
104
  choices=['stdio', 'sse'],
105
+ default='sse',
106
+ help='Run mode: stdio for standard I/O, sse for Server-Sent Events (default: sse)'
107
  )
108
  parser.add_argument(
109
  '--debug',
 
136
  if args.mode == 'sse':
137
  logger.info(f"Host: {args.host}")
138
  logger.info(f"Port: {args.port}")
139
+ logger.info(f"SSE Endpoint: http://{args.host}:{args.port}/sse")
140
  logger.info("=" * 60)
141
 
142
  # Import and run MCP service
 
148
  mcp.run()
149
  else:
150
  logger.info(f"Starting MCP server in SSE mode on {args.host}:{args.port}...")
151
+ logger.info("MCP SSE service is ready to accept connections!")
152
  mcp.run(transport='sse', host=args.host, port=args.port)
153
 
154
  except ImportError as e:
155
  logger.error(f"Failed to import MCP service: {e}")
156
+ logger.error("Make sure mcp_service.py is in the same directory")
157
  sys.exit(1)
158
  except Exception as e:
159
  logger.error(f"Error starting MCP server: {e}")
160
+ import traceback
161
+ traceback.print_exc()
162
  sys.exit(1)
163
 
164