SEUyishu committed on
Commit
0187c48
·
verified ·
1 Parent(s): 5aaf4b5

Update mcp_output/mcp_plugin/mcp_service.py

Browse files
Files changed (1) hide show
  1. mcp_output/mcp_plugin/mcp_service.py +85 -10
mcp_output/mcp_plugin/mcp_service.py CHANGED
@@ -229,6 +229,35 @@ def _safe_extract_tar(archive: tarfile.TarFile, destination: str) -> None:
229
  shutil.copyfileobj(src, dst)
230
 
231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  # ============================================================================
233
  # 会话管理工具
234
  # ============================================================================
@@ -1097,7 +1126,10 @@ def process_structure_data(
1097
  target_index: int = 0,
1098
  graph_max_radius: float = 8.0,
1099
  graph_max_neighbors: int = 12,
1100
- reprocess: bool = False
 
 
 
1101
  ) -> dict:
1102
  """
1103
  Process atomic structure data into graph format.
@@ -1111,6 +1143,9 @@ def process_structure_data(
1111
  graph_max_radius (float): Maximum radius for edges in graph (default: 8.0).
1112
  graph_max_neighbors (int): Maximum number of neighbors per atom (default: 12).
1113
  reprocess (bool): Whether to reprocess data even if processed files exist.
 
 
 
1114
 
1115
  Returns:
1116
  dict: Contains processing status and dataset information.
@@ -1127,6 +1162,8 @@ def process_structure_data(
1127
 
1128
  # If structure_contents provided, create temp directory
1129
  temp_dir = None
 
 
1130
  if structure_contents is not None:
1131
  if targets_csv is None:
1132
  return {"success": False, "error": "targets_csv is required when providing structure_contents"}
@@ -1136,7 +1173,8 @@ def process_structure_data(
1136
 
1137
  # Write structure files
1138
  for filename, content in structure_contents.items():
1139
- filepath = os.path.join(temp_dir, filename)
 
1140
  with open(filepath, 'w') as f:
1141
  f.write(content)
1142
 
@@ -1145,20 +1183,33 @@ def process_structure_data(
1145
  f.write(targets_csv)
1146
 
1147
  data_path = temp_dir
 
 
 
 
 
 
 
 
 
 
1148
 
1149
  if data_path is None:
1150
  return {"success": False, "error": "Either data_path or structure_contents must be provided"}
1151
 
1152
  if not os.path.exists(data_path):
1153
  return {"success": False, "error": f"Data path not found: {data_path}"}
 
 
 
1154
 
1155
  processing_args = {
1156
  "dataset_type": "inmemory",
1157
  "data_path": data_path,
1158
  "target_path": "targets.csv",
1159
- "dictionary_source": "default",
1160
- "dictionary_path": "atom_dict.json",
1161
- "data_format": "json",
1162
  "verbose": "True",
1163
  "graph_max_radius": graph_max_radius,
1164
  "graph_max_neighbors": graph_max_neighbors,
@@ -1187,6 +1238,7 @@ def process_structure_data(
1187
  "data_path": data_path,
1188
  "is_temporary": temp_dir is not None
1189
  }
 
1190
 
1191
  # Note: Don't delete temp_dir yet, it may be needed for training
1192
  if temp_dir:
@@ -1819,7 +1871,10 @@ def process_session_data(
1819
  target_index: int = 0,
1820
  graph_max_radius: float = 8.0,
1821
  graph_max_neighbors: int = 12,
1822
- reprocess: bool = True
 
 
 
1823
  ) -> dict:
1824
  """
1825
  Process all uploaded structure files in a session into graph format.
@@ -1830,6 +1885,9 @@ def process_session_data(
1830
  graph_max_radius (float): Maximum radius for graph edges (default: 8.0 Angstrom).
1831
  graph_max_neighbors (int): Maximum neighbors per atom (default: 12).
1832
  reprocess (bool): Force reprocessing even if already processed (default: True).
 
 
 
1833
 
1834
  Returns:
1835
  dict: Processing status and dataset statistics.
@@ -1851,20 +1909,36 @@ def process_session_data(
1851
  "error": "targets.csv not found. Please upload targets using upload_targets first."
1852
  }
1853
 
1854
- files = [f for f in os.listdir(data_path) if f != "targets.csv" and not f.startswith('.')]
 
 
 
 
1855
  if len(files) == 0:
1856
  return {
1857
  "success": False,
1858
  "error": "No structure files found. Please upload structure files first."
1859
  }
1860
 
 
 
 
 
 
 
 
 
 
 
 
 
1861
  processing_args = {
1862
  "dataset_type": "inmemory",
1863
  "data_path": data_path,
1864
  "target_path": "targets.csv",
1865
- "dictionary_source": "default",
1866
- "dictionary_path": "atom_dict.json",
1867
- "data_format": "json",
1868
  "verbose": "True",
1869
  "graph_max_radius": graph_max_radius,
1870
  "graph_max_neighbors": graph_max_neighbors,
@@ -1897,6 +1971,7 @@ def process_session_data(
1897
  "avg_edges_per_structure": float(np.mean(num_edges_list)),
1898
  "num_node_features": dataset[0].x.shape[1] if len(dataset) > 0 else 0
1899
  },
 
1900
  "ready_for_training": True,
1901
  "next_step": "Use train_session_model to train a model on this data."
1902
  }
 
229
  shutil.copyfileobj(src, dst)
230
 
231
 
232
+ def _infer_structure_format(structure_files: List[str]) -> Optional[str]:
233
+ """Guess the structure file extension for ASE parsing."""
234
+ normalized: Dict[str, int] = {}
235
+ for file_name in structure_files:
236
+ root, ext = os.path.splitext(file_name)
237
+ if ext:
238
+ ext = ext.lower().lstrip('.')
239
+ else:
240
+ upper_name = os.path.basename(file_name).upper()
241
+ if upper_name in {"POSCAR", "CONTCAR"}:
242
+ ext = "vasp"
243
+ else:
244
+ ext = ""
245
+ if not ext:
246
+ continue
247
+ normalized[ext] = normalized.get(ext, 0) + 1
248
+ if not normalized:
249
+ return None
250
+ if len(normalized) == 1:
251
+ return next(iter(normalized))
252
+ # Prefer common solid-state formats
253
+ priority = ["cif", "vasp", "poscar", "xyz", "json"]
254
+ sorted_items = sorted(normalized.items(), key=lambda item: (-item[1], priority.index(item[0]) if item[0] in priority else 99))
255
+ top_ext, top_count = sorted_items[0]
256
+ if len(structure_files) == top_count:
257
+ return top_ext
258
+ return None
259
+
260
+
261
  # ============================================================================
262
  # 会话管理工具
263
  # ============================================================================
 
1126
  target_index: int = 0,
1127
  graph_max_radius: float = 8.0,
1128
  graph_max_neighbors: int = 12,
1129
+ reprocess: bool = False,
1130
+ data_format: Optional[str] = None,
1131
+ dictionary_source: str = "default",
1132
+ dictionary_path: str = "atom_dict.json"
1133
  ) -> dict:
1134
  """
1135
  Process atomic structure data into graph format.
 
1143
  graph_max_radius (float): Maximum radius for edges in graph (default: 8.0).
1144
  graph_max_neighbors (int): Maximum number of neighbors per atom (default: 12).
1145
  reprocess (bool): Whether to reprocess data even if processed files exist.
1146
+ data_format (str, optional): Explicit structure file format (e.g., 'cif', 'vasp', 'xyz', 'json').
1147
+ dictionary_source (str): Atom dictionary source ('default', 'blank', 'generated', 'provided').
1148
+ dictionary_path (str): Relative path to atom dictionary when dictionary_source='provided'.
1149
 
1150
  Returns:
1151
  dict: Contains processing status and dataset information.
 
1162
 
1163
  # If structure_contents provided, create temp directory
1164
  temp_dir = None
1165
+ determined_format = data_format.lower() if isinstance(data_format, str) else None
1166
+
1167
  if structure_contents is not None:
1168
  if targets_csv is None:
1169
  return {"success": False, "error": "targets_csv is required when providing structure_contents"}
 
1173
 
1174
  # Write structure files
1175
  for filename, content in structure_contents.items():
1176
+ clean_name = _normalize_filename(filename)
1177
+ filepath = _safe_join(temp_dir, clean_name)
1178
  with open(filepath, 'w') as f:
1179
  f.write(content)
1180
 
 
1183
  f.write(targets_csv)
1184
 
1185
  data_path = temp_dir
1186
+
1187
+ if determined_format is None:
1188
+ detected = _infer_structure_format(list(structure_contents.keys()))
1189
+ if detected:
1190
+ determined_format = detected
1191
+ elif determined_format is None:
1192
+ determined_format = _infer_structure_format([
1193
+ f for f in os.listdir(data_path)
1194
+ if os.path.isfile(os.path.join(data_path, f)) and f.lower() != "targets.csv"
1195
+ ])
1196
 
1197
  if data_path is None:
1198
  return {"success": False, "error": "Either data_path or structure_contents must be provided"}
1199
 
1200
  if not os.path.exists(data_path):
1201
  return {"success": False, "error": f"Data path not found: {data_path}"}
1202
+
1203
+ if determined_format is None:
1204
+ determined_format = "json"
1205
 
1206
  processing_args = {
1207
  "dataset_type": "inmemory",
1208
  "data_path": data_path,
1209
  "target_path": "targets.csv",
1210
+ "dictionary_source": dictionary_source,
1211
+ "dictionary_path": dictionary_path,
1212
+ "data_format": determined_format,
1213
  "verbose": "True",
1214
  "graph_max_radius": graph_max_radius,
1215
  "graph_max_neighbors": graph_max_neighbors,
 
1238
  "data_path": data_path,
1239
  "is_temporary": temp_dir is not None
1240
  }
1241
+ result["data_format"] = determined_format
1242
 
1243
  # Note: Don't delete temp_dir yet, it may be needed for training
1244
  if temp_dir:
 
1871
  target_index: int = 0,
1872
  graph_max_radius: float = 8.0,
1873
  graph_max_neighbors: int = 12,
1874
+ reprocess: bool = True,
1875
+ data_format: Optional[str] = None,
1876
+ dictionary_source: str = "default",
1877
+ dictionary_path: str = "atom_dict.json"
1878
  ) -> dict:
1879
  """
1880
  Process all uploaded structure files in a session into graph format.
 
1885
  graph_max_radius (float): Maximum radius for graph edges (default: 8.0 Angstrom).
1886
  graph_max_neighbors (int): Maximum neighbors per atom (default: 12).
1887
  reprocess (bool): Force reprocessing even if already processed (default: True).
1888
+ data_format (str, optional): Explicit structure format ('cif', 'vasp', 'xyz', 'json', ...). Use 'auto' to infer.
1889
+ dictionary_source (str): Atom dictionary source ('default', 'blank', 'generated', 'provided').
1890
+ dictionary_path (str): Path to atom dictionary within the session data when using 'provided'.
1891
 
1892
  Returns:
1893
  dict: Processing status and dataset statistics.
 
1909
  "error": "targets.csv not found. Please upload targets using upload_targets first."
1910
  }
1911
 
1912
+ skip_files = {"targets.csv", "id.json", "atom_dict.json"}
1913
+ files = [
1914
+ f for f in os.listdir(data_path)
1915
+ if f not in skip_files and not f.startswith('.')
1916
+ ]
1917
  if len(files) == 0:
1918
  return {
1919
  "success": False,
1920
  "error": "No structure files found. Please upload structure files first."
1921
  }
1922
 
1923
+ determined_format = data_format.lower() if isinstance(data_format, str) else None
1924
+ if determined_format in {"auto", "infer", ""}:
1925
+ determined_format = None
1926
+
1927
+ if determined_format is None:
1928
+ detected = _infer_structure_format(files)
1929
+ if detected:
1930
+ determined_format = detected
1931
+
1932
+ if determined_format is None:
1933
+ determined_format = "json"
1934
+
1935
  processing_args = {
1936
  "dataset_type": "inmemory",
1937
  "data_path": data_path,
1938
  "target_path": "targets.csv",
1939
+ "dictionary_source": dictionary_source,
1940
+ "dictionary_path": dictionary_path,
1941
+ "data_format": determined_format,
1942
  "verbose": "True",
1943
  "graph_max_radius": graph_max_radius,
1944
  "graph_max_neighbors": graph_max_neighbors,
 
1971
  "avg_edges_per_structure": float(np.mean(num_edges_list)),
1972
  "num_node_features": dataset[0].x.shape[1] if len(dataset) > 0 else 0
1973
  },
1974
+ "data_format": determined_format,
1975
  "ready_for_training": True,
1976
  "next_step": "Use train_session_model to train a model on this data."
1977
  }